import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, roc_auc_score
from mlxtend.plotting import category_scatter, plot_learning_curves, plot_decision_regions
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("Californa_Wine_Production_1980_2020.csv")
data
| Year | CommodityCode | CropName | CountyCode | County | HarvestedAcres | Yield(Unit/Acre) | Production | Price(Dollars/Unit) | Unit | Value(Dollars) | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020 | 216299 | GRAPESWINE | 1 | Alameda | 2530.0 | 5.14 | 13000.0 | 1497.69 | Tons | 19470000 |
| 1 | 2020 | 216299 | GRAPESWINE | 5 | Amador | 5360.0 | 2.31 | 12400.0 | 1318.31 | Tons | 16347000 |
| 2 | 2020 | 216299 | GRAPESWINE | 9 | Calaveras | 579.0 | 3.06 | 1770.0 | 1325.99 | Tons | 2347000 |
| 3 | 2020 | 216299 | GRAPESWINE | 11 | Colusa | 747.0 | 6.02 | 4500.0 | 684.67 | Tons | 3081000 |
| 4 | 2020 | 216299 | GRAPESWINE | 13 | ContraCosta | 1940.0 | 4.69 | 9090.0 | 751.27 | Tons | 6829000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1310 | 1980 | 216299 | GRAPESWINE | 95 | Solano | 1138.0 | 3.99 | 4544.0 | 315.00 | TONS | 1433300 |
| 1311 | 1980 | 216299 | GRAPESWINE | 97 | Sonoma | 23639.0 | 3.34 | 78941.0 | 506.00 | TONS | 39982000 |
| 1312 | 1980 | 216299 | GRAPESWINE | 99 | Stanislaus | 17950.0 | 8.80 | 157900.0 | 183.00 | TONS | 28848000 |
| 1313 | 1980 | 216299 | GRAPESWINE | 107 | Tulare | 15159.0 | 8.88 | 134600.0 | 170.00 | TONS | 22902000 |
| 1314 | 1980 | 216299 | GRAPESWINE | 113 | Yolo | 566.0 | 8.70 | 4924.0 | 274.00 | TONS | 1351000 |
1315 rows × 11 columns
data.rename(columns = {'Yield(Unit/Acre)':'Yield', 'Price(Dollars/Unit)':'Price', 'Value(Dollars)':'Value'}, inplace = True)
data.Unit = "TONS"
data
| Year | CommodityCode | CropName | CountyCode | County | HarvestedAcres | Yield | Production | Price | Unit | Value | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020 | 216299 | GRAPESWINE | 1 | Alameda | 2530.0 | 5.14 | 13000.0 | 1497.69 | TONS | 19470000 |
| 1 | 2020 | 216299 | GRAPESWINE | 5 | Amador | 5360.0 | 2.31 | 12400.0 | 1318.31 | TONS | 16347000 |
| 2 | 2020 | 216299 | GRAPESWINE | 9 | Calaveras | 579.0 | 3.06 | 1770.0 | 1325.99 | TONS | 2347000 |
| 3 | 2020 | 216299 | GRAPESWINE | 11 | Colusa | 747.0 | 6.02 | 4500.0 | 684.67 | TONS | 3081000 |
| 4 | 2020 | 216299 | GRAPESWINE | 13 | ContraCosta | 1940.0 | 4.69 | 9090.0 | 751.27 | TONS | 6829000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1310 | 1980 | 216299 | GRAPESWINE | 95 | Solano | 1138.0 | 3.99 | 4544.0 | 315.00 | TONS | 1433300 |
| 1311 | 1980 | 216299 | GRAPESWINE | 97 | Sonoma | 23639.0 | 3.34 | 78941.0 | 506.00 | TONS | 39982000 |
| 1312 | 1980 | 216299 | GRAPESWINE | 99 | Stanislaus | 17950.0 | 8.80 | 157900.0 | 183.00 | TONS | 28848000 |
| 1313 | 1980 | 216299 | GRAPESWINE | 107 | Tulare | 15159.0 | 8.88 | 134600.0 | 170.00 | TONS | 22902000 |
| 1314 | 1980 | 216299 | GRAPESWINE | 113 | Yolo | 566.0 | 8.70 | 4924.0 | 274.00 | TONS | 1351000 |
1315 rows × 11 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1315 entries, 0 to 1314 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 1315 non-null int64 1 CommodityCode 1315 non-null int64 2 CropName 1315 non-null object 3 CountyCode 1315 non-null int64 4 County 1315 non-null object 5 HarvestedAcres 1302 non-null float64 6 Yield 1266 non-null float64 7 Production 1278 non-null float64 8 Price 1278 non-null float64 9 Unit 1315 non-null object 10 Value 1315 non-null int64 dtypes: float64(4), int64(4), object(3) memory usage: 113.1+ KB
data.describe()
| Year | CommodityCode | CountyCode | HarvestedAcres | Yield | Production | Price | Value | |
|---|---|---|---|---|---|---|---|---|
| count | 1315.000000 | 1315.0 | 1315.000000 | 1302.000000 | 1266.000000 | 1.278000e+03 | 1278.000000 | 1.315000e+03 |
| mean | 2001.171103 | 216299.0 | 59.051711 | 14546.443164 | 5.176288 | 1.023638e+05 | 928.027363 | 5.961252e+07 |
| std | 11.751631 | 0.0 | 31.083778 | 20576.581018 | 3.267309 | 1.711560e+05 | 1244.949783 | 1.237376e+08 |
| min | 1980.000000 | 216299.0 | 1.000000 | 3.000000 | 0.060000 | 2.300000e+01 | 74.000000 | 1.150000e+04 |
| 25% | 1991.000000 | 216299.0 | 33.000000 | 805.750000 | 2.940000 | 3.259750e+03 | 335.827500 | 1.600500e+06 |
| 50% | 2002.000000 | 216299.0 | 61.000000 | 3699.000000 | 4.040000 | 1.740000e+04 | 723.500000 | 1.220210e+07 |
| 75% | 2011.000000 | 216299.0 | 83.000000 | 23884.500000 | 7.185000 | 1.298860e+05 | 1236.887500 | 6.340450e+07 |
| max | 2020.000000 | 216299.0 | 115.000000 | 128613.000000 | 25.000000 | 1.040100e+06 | 36342.070000 | 2.030002e+09 |
data.describe(include='all')
| Year | CommodityCode | CropName | CountyCode | County | HarvestedAcres | Yield | Production | Price | Unit | Value | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1315.000000 | 1315.0 | 1315 | 1315.000000 | 1315 | 1302.000000 | 1266.000000 | 1.278000e+03 | 1278.000000 | 1315 | 1.315000e+03 |
| unique | NaN | NaN | 1 | NaN | 42 | NaN | NaN | NaN | NaN | 1 | NaN |
| top | NaN | NaN | GRAPESWINE | NaN | Alameda | NaN | NaN | NaN | NaN | TONS | NaN |
| freq | NaN | NaN | 1315 | NaN | 41 | NaN | NaN | NaN | NaN | 1315 | NaN |
| mean | 2001.171103 | 216299.0 | NaN | 59.051711 | NaN | 14546.443164 | 5.176288 | 1.023638e+05 | 928.027363 | NaN | 5.961252e+07 |
| std | 11.751631 | 0.0 | NaN | 31.083778 | NaN | 20576.581018 | 3.267309 | 1.711560e+05 | 1244.949783 | NaN | 1.237376e+08 |
| min | 1980.000000 | 216299.0 | NaN | 1.000000 | NaN | 3.000000 | 0.060000 | 2.300000e+01 | 74.000000 | NaN | 1.150000e+04 |
| 25% | 1991.000000 | 216299.0 | NaN | 33.000000 | NaN | 805.750000 | 2.940000 | 3.259750e+03 | 335.827500 | NaN | 1.600500e+06 |
| 50% | 2002.000000 | 216299.0 | NaN | 61.000000 | NaN | 3699.000000 | 4.040000 | 1.740000e+04 | 723.500000 | NaN | 1.220210e+07 |
| 75% | 2011.000000 | 216299.0 | NaN | 83.000000 | NaN | 23884.500000 | 7.185000 | 1.298860e+05 | 1236.887500 | NaN | 6.340450e+07 |
| max | 2020.000000 | 216299.0 | NaN | 115.000000 | NaN | 128613.000000 | 25.000000 | 1.040100e+06 | 36342.070000 | NaN | 2.030002e+09 |
data.dtypes
Year int64 CommodityCode int64 CropName object CountyCode int64 County object HarvestedAcres float64 Yield float64 Production float64 Price float64 Unit object Value int64 dtype: object
data = data.drop(columns=['CommodityCode','CropName','Unit','Value'])
data.duplicated().any()
False
data.duplicated().value_counts()
False 1315 dtype: int64
data.isnull().sum()
Year 0 CountyCode 0 County 0 HarvestedAcres 13 Yield 49 Production 37 Price 37 dtype: int64
data.columns[data.isnull().any()]
Index(['HarvestedAcres', 'Yield', 'Production', 'Price'], dtype='object')
data.isnull().value_counts()
Year CountyCode County HarvestedAcres Yield Production Price
False False False False False False False 1266
True True True 36
True True False False 12
True True 1
dtype: int64
data[data.HarvestedAcres.isna()].sort_values(['County','Year'])
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|---|---|
| 1225 | 1983 | 57 | Nevada | NaN | NaN | 23.0 | 500.0 |
| 1193 | 1984 | 57 | Nevada | NaN | NaN | 46.0 | 496.0 |
| 1163 | 1985 | 57 | Nevada | NaN | NaN | 62.0 | 405.0 |
| 1134 | 1986 | 57 | Nevada | NaN | NaN | 125.0 | 435.0 |
| 1103 | 1987 | 57 | Nevada | NaN | NaN | 238.0 | 465.0 |
| 1072 | 1988 | 57 | Nevada | NaN | NaN | 223.0 | 502.0 |
| 1043 | 1989 | 57 | Nevada | NaN | NaN | 473.0 | 656.0 |
| 1015 | 1990 | 57 | Nevada | NaN | NaN | 527.0 | 928.0 |
| 592 | 2004 | 81 | SanMateo | NaN | NaN | NaN | NaN |
| 760 | 1999 | 105 | Trinity | NaN | NaN | 140.0 | 1204.0 |
| 728 | 2000 | 105 | Trinity | NaN | NaN | 140.0 | 1204.0 |
| 698 | 2001 | 105 | Trinity | NaN | NaN | 140.0 | 1204.0 |
| 666 | 2002 | 105 | Trinity | NaN | NaN | 140.0 | 1204.0 |
data[data.Yield.isna()].sort_values(['County','Year'])
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|---|---|
| 1225 | 1983 | 57 | Nevada | NaN | NaN | 23.0 | 500.0 |
| 1193 | 1984 | 57 | Nevada | NaN | NaN | 46.0 | 496.0 |
| 1163 | 1985 | 57 | Nevada | NaN | NaN | 62.0 | 405.0 |
| 1134 | 1986 | 57 | Nevada | NaN | NaN | 125.0 | 435.0 |
| 1103 | 1987 | 57 | Nevada | NaN | NaN | 238.0 | 465.0 |
| 1072 | 1988 | 57 | Nevada | NaN | NaN | 223.0 | 502.0 |
| 1043 | 1989 | 57 | Nevada | NaN | NaN | 473.0 | 656.0 |
| 1015 | 1990 | 57 | Nevada | NaN | NaN | 527.0 | 928.0 |
| 16 | 2020 | 57 | Nevada | 408.0 | NaN | NaN | NaN |
| 936 | 1993 | 81 | SanMateo | 48.0 | NaN | NaN | NaN |
| 907 | 1994 | 81 | SanMateo | 54.0 | NaN | NaN | NaN |
| 877 | 1995 | 81 | SanMateo | 56.0 | NaN | NaN | NaN |
| 848 | 1996 | 81 | SanMateo | 56.0 | NaN | NaN | NaN |
| 819 | 1997 | 81 | SanMateo | 52.0 | NaN | NaN | NaN |
| 786 | 1998 | 81 | SanMateo | 54.0 | NaN | NaN | NaN |
| 754 | 1999 | 81 | SanMateo | 40.0 | NaN | NaN | NaN |
| 723 | 2000 | 81 | SanMateo | 45.0 | NaN | NaN | NaN |
| 692 | 2001 | 81 | SanMateo | 44.0 | NaN | NaN | NaN |
| 660 | 2002 | 81 | SanMateo | 64.0 | NaN | NaN | NaN |
| 626 | 2003 | 81 | SanMateo | 60.0 | NaN | NaN | NaN |
| 592 | 2004 | 81 | SanMateo | NaN | NaN | NaN | NaN |
| 558 | 2005 | 81 | SanMateo | 88.0 | NaN | NaN | NaN |
| 524 | 2006 | 81 | SanMateo | 89.0 | NaN | NaN | NaN |
| 490 | 2007 | 81 | SanMateo | 98.0 | NaN | NaN | NaN |
| 456 | 2008 | 81 | SanMateo | 96.0 | NaN | NaN | NaN |
| 421 | 2009 | 81 | SanMateo | 135.0 | NaN | NaN | NaN |
| 386 | 2010 | 81 | SanMateo | 137.0 | NaN | NaN | NaN |
| 350 | 2011 | 81 | SanMateo | 135.0 | NaN | NaN | NaN |
| 314 | 2012 | 81 | SanMateo | 131.0 | NaN | NaN | NaN |
| 278 | 2013 | 81 | SanMateo | 153.0 | NaN | NaN | NaN |
| 243 | 2014 | 81 | SanMateo | 154.0 | NaN | NaN | NaN |
| 209 | 2015 | 81 | SanMateo | 165.0 | NaN | NaN | NaN |
| 173 | 2016 | 81 | SanMateo | 152.0 | NaN | NaN | NaN |
| 137 | 2017 | 81 | SanMateo | 164.0 | NaN | NaN | NaN |
| 100 | 2018 | 81 | SanMateo | 126.0 | NaN | NaN | NaN |
| 62 | 2019 | 81 | SanMateo | 181.0 | NaN | NaN | NaN |
| 25 | 2020 | 81 | SanMateo | 176.0 | NaN | NaN | NaN |
| 793 | 1998 | 105 | Trinity | 85.0 | NaN | NaN | NaN |
| 760 | 1999 | 105 | Trinity | NaN | NaN | 140.0 | 1204.0 |
| 728 | 2000 | 105 | Trinity | NaN | NaN | 140.0 | 1204.0 |
| 698 | 2001 | 105 | Trinity | NaN | NaN | 140.0 | 1204.0 |
| 666 | 2002 | 105 | Trinity | NaN | NaN | 140.0 | 1204.0 |
| 632 | 2003 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 598 | 2004 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 564 | 2005 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 530 | 2006 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 496 | 2007 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 462 | 2008 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 428 | 2009 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
data[data.Production.isna()].sort_values(['County','Year'])
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|---|---|
| 16 | 2020 | 57 | Nevada | 408.0 | NaN | NaN | NaN |
| 936 | 1993 | 81 | SanMateo | 48.0 | NaN | NaN | NaN |
| 907 | 1994 | 81 | SanMateo | 54.0 | NaN | NaN | NaN |
| 877 | 1995 | 81 | SanMateo | 56.0 | NaN | NaN | NaN |
| 848 | 1996 | 81 | SanMateo | 56.0 | NaN | NaN | NaN |
| 819 | 1997 | 81 | SanMateo | 52.0 | NaN | NaN | NaN |
| 786 | 1998 | 81 | SanMateo | 54.0 | NaN | NaN | NaN |
| 754 | 1999 | 81 | SanMateo | 40.0 | NaN | NaN | NaN |
| 723 | 2000 | 81 | SanMateo | 45.0 | NaN | NaN | NaN |
| 692 | 2001 | 81 | SanMateo | 44.0 | NaN | NaN | NaN |
| 660 | 2002 | 81 | SanMateo | 64.0 | NaN | NaN | NaN |
| 626 | 2003 | 81 | SanMateo | 60.0 | NaN | NaN | NaN |
| 592 | 2004 | 81 | SanMateo | NaN | NaN | NaN | NaN |
| 558 | 2005 | 81 | SanMateo | 88.0 | NaN | NaN | NaN |
| 524 | 2006 | 81 | SanMateo | 89.0 | NaN | NaN | NaN |
| 490 | 2007 | 81 | SanMateo | 98.0 | NaN | NaN | NaN |
| 456 | 2008 | 81 | SanMateo | 96.0 | NaN | NaN | NaN |
| 421 | 2009 | 81 | SanMateo | 135.0 | NaN | NaN | NaN |
| 386 | 2010 | 81 | SanMateo | 137.0 | NaN | NaN | NaN |
| 350 | 2011 | 81 | SanMateo | 135.0 | NaN | NaN | NaN |
| 314 | 2012 | 81 | SanMateo | 131.0 | NaN | NaN | NaN |
| 278 | 2013 | 81 | SanMateo | 153.0 | NaN | NaN | NaN |
| 243 | 2014 | 81 | SanMateo | 154.0 | NaN | NaN | NaN |
| 209 | 2015 | 81 | SanMateo | 165.0 | NaN | NaN | NaN |
| 173 | 2016 | 81 | SanMateo | 152.0 | NaN | NaN | NaN |
| 137 | 2017 | 81 | SanMateo | 164.0 | NaN | NaN | NaN |
| 100 | 2018 | 81 | SanMateo | 126.0 | NaN | NaN | NaN |
| 62 | 2019 | 81 | SanMateo | 181.0 | NaN | NaN | NaN |
| 25 | 2020 | 81 | SanMateo | 176.0 | NaN | NaN | NaN |
| 793 | 1998 | 105 | Trinity | 85.0 | NaN | NaN | NaN |
| 632 | 2003 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 598 | 2004 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 564 | 2005 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 530 | 2006 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 496 | 2007 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 462 | 2008 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 428 | 2009 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
data[data.Price.isna()].sort_values(['County','Year'])
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|---|---|
| 16 | 2020 | 57 | Nevada | 408.0 | NaN | NaN | NaN |
| 936 | 1993 | 81 | SanMateo | 48.0 | NaN | NaN | NaN |
| 907 | 1994 | 81 | SanMateo | 54.0 | NaN | NaN | NaN |
| 877 | 1995 | 81 | SanMateo | 56.0 | NaN | NaN | NaN |
| 848 | 1996 | 81 | SanMateo | 56.0 | NaN | NaN | NaN |
| 819 | 1997 | 81 | SanMateo | 52.0 | NaN | NaN | NaN |
| 786 | 1998 | 81 | SanMateo | 54.0 | NaN | NaN | NaN |
| 754 | 1999 | 81 | SanMateo | 40.0 | NaN | NaN | NaN |
| 723 | 2000 | 81 | SanMateo | 45.0 | NaN | NaN | NaN |
| 692 | 2001 | 81 | SanMateo | 44.0 | NaN | NaN | NaN |
| 660 | 2002 | 81 | SanMateo | 64.0 | NaN | NaN | NaN |
| 626 | 2003 | 81 | SanMateo | 60.0 | NaN | NaN | NaN |
| 592 | 2004 | 81 | SanMateo | NaN | NaN | NaN | NaN |
| 558 | 2005 | 81 | SanMateo | 88.0 | NaN | NaN | NaN |
| 524 | 2006 | 81 | SanMateo | 89.0 | NaN | NaN | NaN |
| 490 | 2007 | 81 | SanMateo | 98.0 | NaN | NaN | NaN |
| 456 | 2008 | 81 | SanMateo | 96.0 | NaN | NaN | NaN |
| 421 | 2009 | 81 | SanMateo | 135.0 | NaN | NaN | NaN |
| 386 | 2010 | 81 | SanMateo | 137.0 | NaN | NaN | NaN |
| 350 | 2011 | 81 | SanMateo | 135.0 | NaN | NaN | NaN |
| 314 | 2012 | 81 | SanMateo | 131.0 | NaN | NaN | NaN |
| 278 | 2013 | 81 | SanMateo | 153.0 | NaN | NaN | NaN |
| 243 | 2014 | 81 | SanMateo | 154.0 | NaN | NaN | NaN |
| 209 | 2015 | 81 | SanMateo | 165.0 | NaN | NaN | NaN |
| 173 | 2016 | 81 | SanMateo | 152.0 | NaN | NaN | NaN |
| 137 | 2017 | 81 | SanMateo | 164.0 | NaN | NaN | NaN |
| 100 | 2018 | 81 | SanMateo | 126.0 | NaN | NaN | NaN |
| 62 | 2019 | 81 | SanMateo | 181.0 | NaN | NaN | NaN |
| 25 | 2020 | 81 | SanMateo | 176.0 | NaN | NaN | NaN |
| 793 | 1998 | 105 | Trinity | 85.0 | NaN | NaN | NaN |
| 632 | 2003 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 598 | 2004 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 564 | 2005 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 530 | 2006 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 496 | 2007 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 462 | 2008 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 428 | 2009 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
data[data['County'] == 'Nevada']
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|---|---|
| 16 | 2020 | 57 | Nevada | 408.0 | NaN | NaN | NaN |
| 53 | 2019 | 57 | Nevada | 416.0 | 3.92 | 1630.0 | 1564.42 |
| 91 | 2018 | 57 | Nevada | 416.0 | 4.52 | 1880.0 | 1284.57 |
| 128 | 2017 | 57 | Nevada | 416.0 | 3.65 | 1520.0 | 1287.50 |
| 164 | 2016 | 57 | Nevada | 417.0 | 3.33 | 1390.0 | 1386.33 |
| 200 | 2015 | 57 | Nevada | 352.0 | 1.87 | 659.0 | 1349.01 |
| 234 | 2014 | 57 | Nevada | 342.0 | 3.86 | 1320.0 | 1341.67 |
| 269 | 2013 | 57 | Nevada | 424.0 | 2.92 | 1240.0 | 1521.77 |
| 305 | 2012 | 57 | Nevada | 312.0 | 2.75 | 858.0 | 1393.94 |
| 341 | 2011 | 57 | Nevada | 343.0 | 5.19 | 1780.0 | 1101.12 |
| 377 | 2010 | 57 | Nevada | 236.0 | 3.12 | 736.0 | 1402.17 |
| 412 | 2009 | 57 | Nevada | 248.0 | 10.44 | 2590.0 | 542.63 |
| 447 | 2008 | 57 | Nevada | 402.0 | 2.96 | 1189.0 | 1280.82 |
| 481 | 2007 | 57 | Nevada | 385.0 | 4.23 | 1629.0 | 1280.91 |
| 515 | 2006 | 57 | Nevada | 358.0 | 3.71 | 1329.0 | 1170.28 |
| 549 | 2005 | 57 | Nevada | 350.0 | 3.72 | 1302.0 | 1294.55 |
| 583 | 2004 | 57 | Nevada | 349.0 | 4.11 | 1434.0 | 1527.68 |
| 617 | 2003 | 57 | Nevada | 356.0 | 3.67 | 1307.0 | 1350.80 |
| 651 | 2002 | 57 | Nevada | 404.0 | 4.32 | 1746.0 | 1053.00 |
| 683 | 2001 | 57 | Nevada | 348.0 | 3.00 | 1043.0 | 1192.00 |
| 715 | 2000 | 57 | Nevada | 303.0 | 3.57 | 1082.0 | 1096.00 |
| 746 | 1999 | 57 | Nevada | 201.0 | 4.49 | 902.0 | 1031.00 |
| 778 | 1998 | 57 | Nevada | 201.0 | 2.72 | 546.0 | 1105.00 |
| 811 | 1997 | 57 | Nevada | 201.0 | 5.29 | 1063.0 | 1099.00 |
| 841 | 1996 | 57 | Nevada | 201.0 | 4.26 | 856.0 | 1068.00 |
| 869 | 1995 | 57 | Nevada | 201.0 | 2.46 | 495.0 | 1018.00 |
| 899 | 1994 | 57 | Nevada | 174.0 | 3.92 | 682.0 | 701.00 |
| 928 | 1993 | 57 | Nevada | 174.0 | 3.42 | 595.0 | 713.00 |
| 957 | 1992 | 57 | Nevada | 174.0 | 4.93 | 858.0 | 816.00 |
| 986 | 1991 | 57 | Nevada | 174.0 | 4.94 | 860.0 | 661.00 |
| 1015 | 1990 | 57 | Nevada | NaN | NaN | 527.0 | 928.00 |
| 1043 | 1989 | 57 | Nevada | NaN | NaN | 473.0 | 656.00 |
| 1072 | 1988 | 57 | Nevada | NaN | NaN | 223.0 | 502.00 |
| 1103 | 1987 | 57 | Nevada | NaN | NaN | 238.0 | 465.00 |
| 1134 | 1986 | 57 | Nevada | NaN | NaN | 125.0 | 435.00 |
| 1163 | 1985 | 57 | Nevada | NaN | NaN | 62.0 | 405.00 |
| 1193 | 1984 | 57 | Nevada | NaN | NaN | 46.0 | 496.00 |
| 1225 | 1983 | 57 | Nevada | NaN | NaN | 23.0 | 500.00 |
data[data['County'] == 'SanMateo']
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|---|---|
| 25 | 2020 | 81 | SanMateo | 176.0 | NaN | NaN | NaN |
| 62 | 2019 | 81 | SanMateo | 181.0 | NaN | NaN | NaN |
| 100 | 2018 | 81 | SanMateo | 126.0 | NaN | NaN | NaN |
| 137 | 2017 | 81 | SanMateo | 164.0 | NaN | NaN | NaN |
| 173 | 2016 | 81 | SanMateo | 152.0 | NaN | NaN | NaN |
| 209 | 2015 | 81 | SanMateo | 165.0 | NaN | NaN | NaN |
| 243 | 2014 | 81 | SanMateo | 154.0 | NaN | NaN | NaN |
| 278 | 2013 | 81 | SanMateo | 153.0 | NaN | NaN | NaN |
| 314 | 2012 | 81 | SanMateo | 131.0 | NaN | NaN | NaN |
| 350 | 2011 | 81 | SanMateo | 135.0 | NaN | NaN | NaN |
| 386 | 2010 | 81 | SanMateo | 137.0 | NaN | NaN | NaN |
| 421 | 2009 | 81 | SanMateo | 135.0 | NaN | NaN | NaN |
| 456 | 2008 | 81 | SanMateo | 96.0 | NaN | NaN | NaN |
| 490 | 2007 | 81 | SanMateo | 98.0 | NaN | NaN | NaN |
| 524 | 2006 | 81 | SanMateo | 89.0 | NaN | NaN | NaN |
| 558 | 2005 | 81 | SanMateo | 88.0 | NaN | NaN | NaN |
| 592 | 2004 | 81 | SanMateo | NaN | NaN | NaN | NaN |
| 626 | 2003 | 81 | SanMateo | 60.0 | NaN | NaN | NaN |
| 660 | 2002 | 81 | SanMateo | 64.0 | NaN | NaN | NaN |
| 692 | 2001 | 81 | SanMateo | 44.0 | NaN | NaN | NaN |
| 723 | 2000 | 81 | SanMateo | 45.0 | NaN | NaN | NaN |
| 754 | 1999 | 81 | SanMateo | 40.0 | NaN | NaN | NaN |
| 786 | 1998 | 81 | SanMateo | 54.0 | NaN | NaN | NaN |
| 819 | 1997 | 81 | SanMateo | 52.0 | NaN | NaN | NaN |
| 848 | 1996 | 81 | SanMateo | 56.0 | NaN | NaN | NaN |
| 877 | 1995 | 81 | SanMateo | 56.0 | NaN | NaN | NaN |
| 907 | 1994 | 81 | SanMateo | 54.0 | NaN | NaN | NaN |
| 936 | 1993 | 81 | SanMateo | 48.0 | NaN | NaN | NaN |
data[data['County'] == 'Trinity']
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|---|---|
| 71 | 2019 | 105 | Trinity | 44.0 | 1.93 | 85.0 | 1729.41 |
| 108 | 2018 | 105 | Trinity | 44.0 | 1.93 | 85.0 | 1729.41 |
| 144 | 2017 | 105 | Trinity | 44.0 | 1.93 | 85.0 | 1729.41 |
| 180 | 2016 | 105 | Trinity | 44.0 | 1.93 | 85.0 | 1729.41 |
| 285 | 2013 | 105 | Trinity | 114.0 | 2.33 | 266.0 | 1214.29 |
| 321 | 2012 | 105 | Trinity | 114.0 | 2.33 | 266.0 | 1214.29 |
| 357 | 2011 | 105 | Trinity | 114.0 | 2.33 | 266.0 | 1214.29 |
| 393 | 2010 | 105 | Trinity | 114.0 | 2.33 | 266.0 | 1214.29 |
| 428 | 2009 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 462 | 2008 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 496 | 2007 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 530 | 2006 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 564 | 2005 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 598 | 2004 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 632 | 2003 | 105 | Trinity | 114.0 | NaN | NaN | NaN |
| 666 | 2002 | 105 | Trinity | NaN | NaN | 140.0 | 1204.00 |
| 698 | 2001 | 105 | Trinity | NaN | NaN | 140.0 | 1204.00 |
| 728 | 2000 | 105 | Trinity | NaN | NaN | 140.0 | 1204.00 |
| 760 | 1999 | 105 | Trinity | NaN | NaN | 140.0 | 1204.00 |
| 793 | 1998 | 105 | Trinity | 85.0 | NaN | NaN | NaN |
data.dropna(inplace=True)
data.isnull().sum()
Year 0 CountyCode 0 County 0 HarvestedAcres 0 Yield 0 Production 0 Price 0 dtype: int64
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1266 entries, 0 to 1314 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 1266 non-null int64 1 CountyCode 1266 non-null int64 2 County 1266 non-null object 3 HarvestedAcres 1266 non-null float64 4 Yield 1266 non-null float64 5 Production 1266 non-null float64 6 Price 1266 non-null float64 dtypes: float64(4), int64(2), object(1) memory usage: 79.1+ KB
Discretizing the data into classes.
data
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|---|---|
| 0 | 2020 | 1 | Alameda | 2530.0 | 5.14 | 13000.0 | 1497.69 |
| 1 | 2020 | 5 | Amador | 5360.0 | 2.31 | 12400.0 | 1318.31 |
| 2 | 2020 | 9 | Calaveras | 579.0 | 3.06 | 1770.0 | 1325.99 |
| 3 | 2020 | 11 | Colusa | 747.0 | 6.02 | 4500.0 | 684.67 |
| 4 | 2020 | 13 | ContraCosta | 1940.0 | 4.69 | 9090.0 | 751.27 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1310 | 1980 | 95 | Solano | 1138.0 | 3.99 | 4544.0 | 315.00 |
| 1311 | 1980 | 97 | Sonoma | 23639.0 | 3.34 | 78941.0 | 506.00 |
| 1312 | 1980 | 99 | Stanislaus | 17950.0 | 8.80 | 157900.0 | 183.00 |
| 1313 | 1980 | 107 | Tulare | 15159.0 | 8.88 | 134600.0 | 170.00 |
| 1314 | 1980 | 113 | Yolo | 566.0 | 8.70 | 4924.0 | 274.00 |
1266 rows × 7 columns
price_data = data.Price
# len(price_data)
price_data.size
1266
price_data.sort_values(ascending=True)
1156 74.00
1155 90.00
1158 94.00
1188 95.00
1185 97.00
...
370 5125.60
127 5287.55
90 5614.05
52 5862.26
522 36342.07
Name: Price, Length: 1266, dtype: float64
data.sort_values('Price')
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|---|---|
| 1156 | 1985 | 31 | Kings | 1117.0 | 11.90 | 13292.0 | 74.00 |
| 1155 | 1985 | 29 | Kern | 33255.0 | 8.48 | 282000.0 | 90.00 |
| 1158 | 1985 | 39 | Madera | 40100.0 | 7.54 | 302354.0 | 94.00 |
| 1188 | 1984 | 39 | Madera | 36010.0 | 6.87 | 247389.0 | 95.00 |
| 1185 | 1984 | 29 | Kern | 34861.0 | 6.95 | 242160.0 | 97.00 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 370 | 2010 | 41 | Marin | 186.0 | 1.11 | 207.0 | 5125.60 |
| 127 | 2017 | 55 | Napa | 43600.0 | 3.26 | 142000.0 | 5287.55 |
| 90 | 2018 | 55 | Napa | 43400.0 | 4.26 | 185000.0 | 5614.05 |
| 52 | 2019 | 55 | Napa | 44200.0 | 3.62 | 160000.0 | 5862.26 |
| 522 | 2006 | 77 | SanJoaquin | 92501.0 | 0.06 | 5610.0 | 36342.07 |
1266 rows × 7 columns
data['Price'].value_counts()
400.00 9
193.00 5
249.00 5
170.00 5
1263.16 5
..
1046.00 1
1139.48 1
253.16 1
273.03 1
274.00 1
Name: Price, Length: 1057, dtype: int64
data['Price'].value_counts().sort_index()
74.00 1
90.00 1
94.00 1
95.00 1
97.00 1
..
5125.60 1
5287.55 1
5614.05 1
5862.26 1
36342.07 1
Name: Price, Length: 1057, dtype: int64
plt.figure(figsize=(200, 6))
price_data.value_counts().plot(kind = 'bar', title = 'Counts')
<AxesSubplot: title={'center': 'Counts'}>
data['Price_Classification'] = pd.cut(x=data['Price'], bins=[0, 250, 1000, 50000], labels=[0, 1, 2])
data['Price_Categories'] = pd.cut(x=data['Price'], bins=[0, 250, 1000, 50000], labels=["Low", "Medium", "High"])
data.Price_Categories.size
1266
data.Price_Categories.value_counts()
Medium 557 High 496 Low 213 Name: Price_Categories, dtype: int64
data.Price_Categories.value_counts().plot(kind='bar', title='Price Categories')
plt.xticks(rotation=0)
# plt.grid(True)
(array([0, 1, 2]), [Text(0, 0, 'Medium'), Text(1, 0, 'High'), Text(2, 0, 'Low')])
data.Price_Categories.value_counts().plot(kind='pie', title='Price Categories')
<AxesSubplot: title={'center': 'Price Categories'}, ylabel='Price_Categories'>
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1266 entries, 0 to 1314 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 1266 non-null int64 1 CountyCode 1266 non-null int64 2 County 1266 non-null object 3 HarvestedAcres 1266 non-null float64 4 Yield 1266 non-null float64 5 Production 1266 non-null float64 6 Price 1266 non-null float64 7 Price_Classification 1266 non-null category 8 Price_Categories 1266 non-null category dtypes: category(2), float64(4), int64(2), object(1) memory usage: 81.9+ KB
Year (From 1980 to 2020)
CountyCode
County
HarvestedAcres
Yield (Tons/Acres)
Production (Tons)
Price (Dollar/Ton)
$Formula: $
$Yield = \frac{Production}{HarvestedAcres}$
plt.figure(figsize=(12, 6))
data.boxplot()
<AxesSubplot: >
plt.figure(figsize=(12, 6))
sns.boxplot(data)
<AxesSubplot: >
data.plot(kind='box', subplots=True, layout=(3, 3), figsize=(20, 18))
Year AxesSubplot(0.125,0.653529;0.227941x0.226471) CountyCode AxesSubplot(0.398529,0.653529;0.227941x0.226471) HarvestedAcres AxesSubplot(0.672059,0.653529;0.227941x0.226471) Yield AxesSubplot(0.125,0.381765;0.227941x0.226471) Production AxesSubplot(0.398529,0.381765;0.227941x0.226471) Price AxesSubplot(0.672059,0.381765;0.227941x0.226471) dtype: object
sns.heatmap(data.corr(), annot=True)
plt.tight_layout()
sns.pairplot(data, hue='Price_Categories', palette='tab10')
<seaborn.axisgrid.PairGrid at 0x1a7563c9c90>
year_group_df = data.groupby(['Year']).mean()
year_group_df
| CountyCode | HarvestedAcres | Yield | Production | Price | |
|---|---|---|---|---|---|
| Year | |||||
| 1980 | 58.363636 | 11648.181818 | 5.423182 | 81257.045455 | 311.181818 |
| 1981 | 58.363636 | 11361.045455 | 5.122727 | 74517.454545 | 365.909091 |
| 1982 | 56.724138 | 10141.793103 | 5.392759 | 72930.310345 | 333.379310 |
| 1983 | 56.290323 | 10020.483871 | 4.612903 | 59887.290323 | 328.935484 |
| 1984 | 57.466667 | 10810.733333 | 4.643000 | 64995.700000 | 314.333333 |
| 1985 | 58.142857 | 11566.142857 | 5.326786 | 78133.321429 | 294.035714 |
| 1986 | 56.448276 | 11338.310345 | 5.053793 | 70793.103448 | 320.862069 |
| 1987 | 57.466667 | 10421.900000 | 4.572000 | 64398.266667 | 363.933333 |
| 1988 | 60.500000 | 11734.607143 | 5.097857 | 82406.928571 | 461.928571 |
| 1989 | 60.500000 | 12157.607143 | 5.382143 | 82915.714286 | 543.321429 |
| 1990 | 59.857143 | 11866.464286 | 5.058214 | 82378.000000 | 532.928571 |
| 1991 | 59.758621 | 11439.965517 | 5.310345 | 79712.275862 | 551.827586 |
| 1992 | 59.758621 | 14532.551724 | 5.518276 | 115578.896552 | 585.103448 |
| 1993 | 58.928571 | 13971.035714 | 5.612857 | 110075.071429 | 572.750000 |
| 1994 | 57.482759 | 13144.931034 | 5.321724 | 93701.448276 | 581.448276 |
| 1995 | 57.482759 | 14098.068966 | 5.145862 | 106062.379310 | 668.551724 |
| 1996 | 55.666667 | 15990.074074 | 5.302963 | 112124.962963 | 812.814815 |
| 1997 | 57.413793 | 16439.206897 | 6.615172 | 143187.896552 | 893.068966 |
| 1998 | 60.161290 | 15704.064516 | 5.067419 | 109923.645161 | 940.096774 |
| 1999 | 58.466667 | 17762.833333 | 4.806333 | 107824.433333 | 1035.866667 |
| 2000 | 59.500000 | 16633.214286 | 5.092143 | 132022.071429 | 1039.821429 |
| 2001 | 60.333333 | 16330.033333 | 4.642333 | 107666.200000 | 1023.766667 |
| 2002 | 58.375000 | 17533.781250 | 4.671875 | 121630.875000 | 977.625000 |
| 2003 | 58.375000 | 17039.093750 | 4.500312 | 107848.437500 | 972.457188 |
| 2004 | 58.375000 | 18094.218750 | 4.444375 | 113704.875000 | 985.163750 |
| 2005 | 58.375000 | 18063.125000 | 5.443750 | 139594.375000 | 996.616250 |
| 2006 | 58.375000 | 17290.531250 | 4.634062 | 92065.218750 | 2148.246562 |
| 2007 | 58.375000 | 17021.812500 | 4.664688 | 115152.156250 | 1156.425625 |
| 2008 | 58.375000 | 17666.000000 | 4.590625 | 121311.625000 | 1268.173125 |
| 2009 | 59.303030 | 16797.393939 | 5.448485 | 121407.212121 | 1118.793636 |
| 2010 | 60.371429 | 15781.371429 | 5.030857 | 113799.657143 | 1102.573429 |
| 2011 | 59.000000 | 15903.027778 | 5.267222 | 106007.888889 | 1135.948889 |
| 2012 | 57.400000 | 17577.571429 | 6.063143 | 130217.885714 | 1162.566857 |
| 2013 | 57.400000 | 15539.371429 | 5.696286 | 119421.685714 | 1229.459143 |
| 2014 | 56.151515 | 16944.030303 | 5.313030 | 117627.787879 | 1239.334848 |
| 2015 | 54.882353 | 16403.617647 | 4.743529 | 103503.529412 | 1284.156765 |
| 2016 | 56.314286 | 16588.028571 | 5.207714 | 111014.857143 | 1380.673429 |
| 2017 | 56.314286 | 16300.485714 | 5.417714 | 110198.771429 | 1397.224000 |
| 2018 | 57.611111 | 15638.500000 | 5.603056 | 113497.027778 | 1447.444722 |
| 2019 | 58.729730 | 15292.594595 | 5.691351 | 105145.243243 | 1467.862703 |
| 2020 | 57.882353 | 16282.823529 | 5.570294 | 101469.411765 | 1396.155294 |
year_group_df.index
Int64Index([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020],
dtype='int64', name='Year')
plt.figure(figsize=(12, 6))
sns.lineplot(x=data.groupby(['Year']).count().index, y=data.groupby(['Year']).count()['County'])
plt.ylabel(ylabel='Number of counties producing wine')
plt.title(label='Fluctuation in the number of wine-producing counties across time')
plt.show()
plt.figure(figsize=(12, 6))
sns.lineplot(x=year_group_df.index, y=year_group_df['Production'])
plt.title(label='Fluctuations in Production over Time')
plt.show()
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='Year', y='Production')
plt.title(label='Fluctuations in Production over Time')
plt.show()
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='Year', y='Production')
plt.xticks(rotation=90)
plt.title(label='Annual Grapes Production in Tons')
plt.show()
data.sort_values('Production').tail(5)
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | Price_Classification | Price_Categories | |
|---|---|---|---|---|---|---|---|---|---|
| 293 | 2012 | 19 | Fresno | 86700.0 | 10.38 | 900000.0 | 385.48 | 1 | Medium |
| 329 | 2011 | 19 | Fresno | 68900.0 | 13.13 | 905000.0 | 335.50 | 1 | Medium |
| 640 | 2002 | 19 | Fresno | 85701.0 | 10.83 | 928100.0 | 141.00 | 0 | Low |
| 801 | 1997 | 19 | Fresno | 87220.0 | 11.22 | 978930.0 | 225.00 | 0 | Low |
| 436 | 2008 | 19 | Fresno | 69631.0 | 14.94 | 1040100.0 | 250.93 | 1 | Medium |
sns.barplot(x='Year', y='Production', data=data.sort_values('Production').tail(5))
plt.title(label='Top Five Wine Production Years')
Text(0.5, 1.0, 'Top Five Wine Production Years')
plt.figure(figsize=(12, 6))
sns.lineplot(x=year_group_df.index, y=year_group_df['HarvestedAcres'])
plt.title(label='Fluctuations in Harvested Acres over Time')
plt.show()
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='Year', y='HarvestedAcres')
plt.title(label='Fluctuations in Harvested Acres over Time')
plt.show()
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='Year', y='HarvestedAcres')
plt.xticks(rotation=90)
plt.title(label='Annual Grapes Harvested Area in Acres')
plt.show()
data.sort_values('HarvestedAcres').tail(5)
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | Price_Classification | Price_Categories | |
|---|---|---|---|---|---|---|---|---|---|
| 171 | 2016 | 77 | SanJoaquin | 98000.0 | 7.31 | 716000.0 | 594.67 | 1 | Medium |
| 135 | 2017 | 77 | SanJoaquin | 98100.0 | 6.79 | 666000.0 | 593.91 | 1 | Medium |
| 241 | 2014 | 77 | SanJoaquin | 102000.0 | 8.00 | 816000.0 | 590.00 | 1 | Medium |
| 312 | 2012 | 77 | SanJoaquin | 109000.0 | 8.18 | 892000.0 | 605.72 | 1 | Medium |
| 736 | 1999 | 19 | Fresno | 128613.0 | 5.40 | 693910.0 | 237.00 | 0 | Low |
sns.barplot(x='Year', y='HarvestedAcres', data=data.sort_values('HarvestedAcres').tail(5))
plt.title(label='Top Five Grapes Harvest Years')
Text(0.5, 1.0, 'Top Five Grapes Harvest Years')
plt.figure(figsize=(12, 6))
sns.lineplot(x=year_group_df.index, y=year_group_df['Yield'])
plt.title(label='Fluctuations in Yield over Time')
plt.show()
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='Year', y='Yield')
plt.title(label='Fluctuations in Yield over Time')
plt.show()
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='Year', y='Yield')
plt.xticks(rotation=90)
plt.title(label='Annual Grapes Yield in Tons per Acres')
plt.show()
data.sort_values('Yield').tail(5)
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | Price_Classification | Price_Categories | |
|---|---|---|---|---|---|---|---|---|---|
| 145 | 2017 | 107 | Tulare | 9450.0 | 17.35 | 164000.0 | 308.00 | 1 | Medium |
| 109 | 2018 | 107 | Tulare | 8410.0 | 17.72 | 149000.0 | 314.00 | 1 | Medium |
| 72 | 2019 | 107 | Tulare | 6260.0 | 18.69 | 117000.0 | 290.00 | 1 | Medium |
| 338 | 2011 | 51 | Mono | 4.0 | 23.50 | 94.0 | 670.21 | 1 | Medium |
| 302 | 2012 | 51 | Mono | 3.0 | 25.00 | 75.0 | 653.33 | 1 | Medium |
sns.barplot(x='Year', y='Yield', data=data.sort_values('Yield').tail(5))
plt.title(label='Top Five Wine Yield Years')
Text(0.5, 1.0, 'Top Five Wine Yield Years')
plt.figure(figsize=(12, 6))
sns.lineplot(x=year_group_df.index, y=year_group_df['Price'])
plt.title(label='Fluctuations in Price over Time')
plt.show()
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='Year', y='Price')
plt.title(label='Fluctuations in Price over Time')
plt.show()
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='Year', y='Price')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Price per Ton over Time')
plt.show()
plt.figure(figsize=(12, 6))
sns.countplot(x='Year', hue='Price_Categories', data=data)
plt.xticks(rotation=90)
plt.title(label='Price Categories over Time')
plt.show()
import plotly.express as px
fig = px.line(data, x='Year', y='Price', color='Price_Categories', symbol="Price_Categories")
fig.show()
data.sort_values('Price').tail(5)
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | Price_Classification | Price_Categories | |
|---|---|---|---|---|---|---|---|---|---|
| 370 | 2010 | 41 | Marin | 186.0 | 1.11 | 207.0 | 5125.60 | 2 | High |
| 127 | 2017 | 55 | Napa | 43600.0 | 3.26 | 142000.0 | 5287.55 | 2 | High |
| 90 | 2018 | 55 | Napa | 43400.0 | 4.26 | 185000.0 | 5614.05 | 2 | High |
| 52 | 2019 | 55 | Napa | 44200.0 | 3.62 | 160000.0 | 5862.26 | 2 | High |
| 522 | 2006 | 77 | SanJoaquin | 92501.0 | 0.06 | 5610.0 | 36342.07 | 2 | High |
sns.barplot(x='Year', y='Price', data=data.sort_values('Price').tail(5))
plt.title(label='Top Five Wine Price Years')
Text(0.5, 1.0, 'Top Five Wine Price Years')
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='County', y='Production')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Production over County')
plt.show()
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='County', y='Production')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Production over County')
plt.show()
data.sort_values('Production').tail(1)
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | Price_Classification | Price_Categories | |
|---|---|---|---|---|---|---|---|---|---|
| 436 | 2008 | 19 | Fresno | 69631.0 | 14.94 | 1040100.0 | 250.93 | 1 | Medium |
sns.barplot(x='County', y='Production', data=data.sort_values('Production').tail(1))
plt.title(label='Top Wine Production County')
Text(0.5, 1.0, 'Top Wine Production County')
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='County', y='HarvestedAcres')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in HarvestedAcres over County')
plt.show()
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='County', y='HarvestedAcres')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in HarvestedAcres over County')
plt.show()
data.sort_values('HarvestedAcres').tail(1)
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | Price_Classification | Price_Categories | |
|---|---|---|---|---|---|---|---|---|---|
| 736 | 1999 | 19 | Fresno | 128613.0 | 5.4 | 693910.0 | 237.0 | 0 | Low |
sns.barplot(x='County', y='HarvestedAcres', data=data.sort_values('HarvestedAcres').tail(1))
plt.title(label='Top Grapes Harvest County')
Text(0.5, 1.0, 'Top Grapes Harvest County')
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='County', y='Yield')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Yield over County')
plt.show()
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='County', y='Yield')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Yield over County')
plt.show()
data.sort_values('Yield').tail(1)
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | Price_Classification | Price_Categories | |
|---|---|---|---|---|---|---|---|---|---|
| 302 | 2012 | 51 | Mono | 3.0 | 25.0 | 75.0 | 653.33 | 1 | Medium |
sns.barplot(x='County', y='Yield', data=data.sort_values('Yield').tail(1))
plt.title(label='Top Wine Yield County')
Text(0.5, 1.0, 'Top Wine Yield County')
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='County', y='Price')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Price over County')
plt.show()
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='County', y='Price')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Price over County')
plt.show()
plt.figure(figsize=(12, 6))
sns.countplot(x='County', hue='Price_Categories', data=data)
plt.xticks(rotation=90)
plt.title(label='Price Categories over County')
plt.show()
data.sort_values('Price').tail(1)
| Year | CountyCode | County | HarvestedAcres | Yield | Production | Price | Price_Classification | Price_Categories | |
|---|---|---|---|---|---|---|---|---|---|
| 522 | 2006 | 77 | SanJoaquin | 92501.0 | 0.06 | 5610.0 | 36342.07 | 2 | High |
sns.barplot(x='County', y='Price', data=data.sort_values('Price').tail(1))
plt.title(label='Top Wine Price County')
Text(0.5, 1.0, 'Top Wine Price County')
fig = category_scatter(x='Yield', y='Production', label_col='Price_Categories', data=data, legend_loc='upper left')
plt.xlabel('Yield')
plt.ylabel('Production')
plt.title(label='Price Categories over Yield and Production')
# plt.grid(True)
plt.show()
plt.figure(figsize=(12, 6))
sns.scatterplot(data, x='Yield', y='Production', hue="Price_Categories")
plt.title(label='Price Categories over Yield and Production')
Text(0.5, 1.0, 'Price Categories over Yield and Production')
sns.scatterplot(data, x='CountyCode', y='Year', hue="Price_Categories")
plt.title(label='Price Categories over CountyCode and Year')
Text(0.5, 1.0, 'Price Categories over CountyCode and Year')
plt.figure(figsize=(12, 6))
sns.lineplot(data, x="CountyCode", y="Year", hue="Price_Categories")
plt.title(label='Price Categories over CountyCode and Year')
Text(0.5, 1.0, 'Price Categories over CountyCode and Year')
Setting up the target variable.
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1266 entries, 0 to 1314 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 1266 non-null int64 1 CountyCode 1266 non-null int64 2 County 1266 non-null object 3 HarvestedAcres 1266 non-null float64 4 Yield 1266 non-null float64 5 Production 1266 non-null float64 6 Price 1266 non-null float64 7 Price_Classification 1266 non-null category 8 Price_Categories 1266 non-null category dtypes: category(2), float64(4), int64(2), object(1) memory usage: 81.9+ KB
features = ['Year', 'CountyCode', 'HarvestedAcres', 'Yield', 'Production']
x = data[features]
y = data.Price_Classification
x
| Year | CountyCode | HarvestedAcres | Yield | Production | |
|---|---|---|---|---|---|
| 0 | 2020 | 1 | 2530.0 | 5.14 | 13000.0 |
| 1 | 2020 | 5 | 5360.0 | 2.31 | 12400.0 |
| 2 | 2020 | 9 | 579.0 | 3.06 | 1770.0 |
| 3 | 2020 | 11 | 747.0 | 6.02 | 4500.0 |
| 4 | 2020 | 13 | 1940.0 | 4.69 | 9090.0 |
| ... | ... | ... | ... | ... | ... |
| 1310 | 1980 | 95 | 1138.0 | 3.99 | 4544.0 |
| 1311 | 1980 | 97 | 23639.0 | 3.34 | 78941.0 |
| 1312 | 1980 | 99 | 17950.0 | 8.80 | 157900.0 |
| 1313 | 1980 | 107 | 15159.0 | 8.88 | 134600.0 |
| 1314 | 1980 | 113 | 566.0 | 8.70 | 4924.0 |
1266 rows × 5 columns
x.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1266 entries, 0 to 1314 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 1266 non-null int64 1 CountyCode 1266 non-null int64 2 HarvestedAcres 1266 non-null float64 3 Yield 1266 non-null float64 4 Production 1266 non-null float64 dtypes: float64(3), int64(2) memory usage: 59.3 KB
y = y.astype('int64')
y.info()
<class 'pandas.core.series.Series'> Int64Index: 1266 entries, 0 to 1314 Series name: Price_Classification Non-Null Count Dtype -------------- ----- 1266 non-null int64 dtypes: int64(1) memory usage: 19.8 KB
y.value_counts()
1 557 2 496 0 213 Name: Price_Classification, dtype: int64
y.value_counts().plot(kind = 'bar', title = 'Price Classification Counts')
plt.xticks(rotation=0)
(array([0, 1, 2]), [Text(0, 0, '1'), Text(1, 0, '2'), Text(2, 0, '0')])
sns.heatmap(x.corr(), annot=True)
plt.tight_layout()
scaler = MinMaxScaler()
# scaler.fit(x)
# scaler.transform(x)
x = scaler.fit_transform(x)
x
array([[1. , 0. , 0.01964855, 0.20368885, 0.01246272],
[1. , 0.03508772, 0.04165306, 0.09021652, 0.01188583],
[1. , 0.07017544, 0.00447866, 0.12028869, 0.00166529],
...,
[0. , 0.85964912, 0.13954591, 0.35044106, 0.15178134],
[0. , 0.92982456, 0.11784465, 0.35364876, 0.12937883],
[0. , 0.98245614, 0.00437758, 0.34643144, 0.0046978 ]])
sns.boxplot(x)
<AxesSubplot: >
Split dataset into 80% for training and 20% for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)
print("Shape of original dataset:", data.shape)
print("Shape of input training set:", x_train.shape)
print("Shape of output training set:", y_train.shape)
print("Shape of input testing set:", x_test.shape)
print("Shape of output testing set:", y_test.shape)
Shape of original dataset: (1266, 9) Shape of input training set: (1012, 5) Shape of output training set: (1012,) Shape of input testing set: (254, 5) Shape of output testing set: (254,)
y_test.size
254
y_test.value_counts()
2 110 1 109 0 35 Name: Price_Classification, dtype: int64
Performance Functions for Models
true_class_names = ["True Low", "True Medium", "True High"]
predicted_class_names = ["Predicted Low", "Predicted Medium", "Predicted High"]
def Confusion_Matrix_Plotter(cm, dtype):
cm_df = pd.DataFrame(cm, index = true_class_names, columns = predicted_class_names)
if dtype == 1:
sns.heatmap(cm_df, annot=True, fmt="d")
plt.title('Confusion Matrix')
else:
sns.heatmap(cm_df, annot=True)
plt.title('Confusion Matrix Percentage')
plt.tight_layout()
def Compute_Error(cm):
n11, n12, n13, n21, n22, n23, n31, n32, n33 = cm.ravel()
TP_C1 = n11
TN_C1 = n22 + n33
FP_C1 = n21 + n31
FN_C1 = n12 + n13
Type1_Error_C1 = FP_C1
Type2_Error_C1 = FN_C1
print("Type1_Error_LowPrice:", Type1_Error_C1)
print("Type2_Error_LowPrice:", Type2_Error_C1)
TP_C2 = n22
TN_C2 = n11 + n33
FP_C2 = n12 + n32
FN_C2 = n21 + n23
Type1_Error_C2 = FP_C2
Type2_Error_C2 = FN_C2
print("Type1_Error_MediumPrice:", Type1_Error_C2)
print("Type2_Error_MediumPrice:", Type2_Error_C2)
TP_C3 = n33
TN_C3 = n11 + n22
FP_C3 = n13 + n23
FN_C3 = n31 + n32
Type1_Error_C3 = FP_C3
Type2_Error_C3 = FN_C3
print("Type1_Error_HighPrice:", Type1_Error_C3)
print("Type2_Error_HighPrice:", Type2_Error_C3)
return Type1_Error_C1, Type2_Error_C1, Type1_Error_C2, Type2_Error_C2, Type1_Error_C3, Type2_Error_C3
def Compute_Sensitivity(TP, FN):
sensitivity_test = (TP / (TP + FN))
return sensitivity_test
def Compute_Specificity(TN, FP):
specificity_test = (TN / (FP + TN))
return specificity_test
nb = GaussianNB()
nb.fit(x_train, y_train)
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GaussianNB()
# Model Scores on training and test set
print("Training Set score:", nb.score(x_train, y_train))
print("Test Set score:", nb.score(x_test, y_test))
Training Set score: 0.708498023715415 Test Set score: 0.6929133858267716
# Prediction on Testing Data
y_pred_nb = nb.predict(x_test)
nb_accuracy = metrics.accuracy_score(y_test, y_pred_nb)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_nb))
Accuracy: 0.6929133858267716
# Prediction on Training Data
y_pred2_nb = nb.predict(x_train)
nb_taccuracy = metrics.accuracy_score(y_train, y_pred2_nb)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_nb))
Accuracy: 0.708498023715415
confusion_matrix_nb = metrics.confusion_matrix(y_test, y_pred_nb)
confusion_matrix_nb
array([[18, 15, 2],
[12, 64, 33],
[ 0, 16, 94]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_nb, 1)
confusion_matrix_nb_percent = confusion_matrix_nb.astype('float') / confusion_matrix_nb.sum(axis=1)[:, np.newaxis]
confusion_matrix_nb_percent
array([[0.51428571, 0.42857143, 0.05714286],
[0.11009174, 0.58715596, 0.30275229],
[0. , 0.14545455, 0.85454545]])
Confusion_Matrix_Plotter(confusion_matrix_nb_percent, 0)
print(classification_report(y_test, y_pred_nb, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.60 0.51 0.55 35
Medium Price 0.67 0.59 0.63 109
High Price 0.73 0.85 0.79 110
accuracy 0.69 254
macro avg 0.67 0.65 0.66 254
weighted avg 0.69 0.69 0.69 254
nb_t1_l, nb_t2_l, nb_t1_m, nb_t2_m, nb_t1_h, nb_t2_h = Compute_Error(confusion_matrix_nb)
Type1_Error_LowPrice: 12 Type2_Error_LowPrice: 17 Type1_Error_MediumPrice: 31 Type2_Error_MediumPrice: 45 Type1_Error_HighPrice: 35 Type2_Error_HighPrice: 16
nb_pl, nb_pm, nb_ph = precision_score(y_test, y_pred_nb, average=None)
nb_rl, nb_rm, nb_rh = recall_score(y_test, y_pred_nb, average=None)
nb_fl, nb_fm, nb_fh = f1_score(y_test, y_pred_nb, average=None)
cv_nb = cross_val_score(nb, x_train, y_train, cv = 10, scoring='accuracy')
cv_nb
array([0.68627451, 0.65686275, 0.69306931, 0.79207921, 0.68316832,
0.62376238, 0.78217822, 0.73267327, 0.7029703 , 0.67326733])
cv_nb_m = cv_nb.mean()
print("Cross Validation Score:", cv_nb_m)
Cross Validation Score: 0.7026305571733643
plot_learning_curves(x_train, y_train, x_test, y_test, nb)
plt.show()
svc = SVC()
svc.fit(x_train, y_train)
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC()
# Model Scores on training and test set
print("Training Set score:", svc.score(x_train, y_train))
print("Test Set score:", svc.score(x_test, y_test))
Training Set score: 0.8320158102766798 Test Set score: 0.7992125984251969
# Prediction on Testing Data
y_pred_svc = svc.predict(x_test)
svc_accuracy = metrics.accuracy_score(y_test, y_pred_svc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_svc))
Accuracy: 0.7992125984251969
# Prediction on Training Data
y_pred2_svc = svc.predict(x_train)
svc_taccuracy = metrics.accuracy_score(y_train, y_pred2_svc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_svc))
Accuracy: 0.8320158102766798
confusion_matrix_svc = metrics.confusion_matrix(y_test, y_pred_svc)
confusion_matrix_svc
array([[24, 10, 1],
[ 8, 84, 17],
[ 0, 15, 95]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_svc, 1)
confusion_matrix_svc_percent = confusion_matrix_svc.astype('float') / confusion_matrix_svc.sum(axis=1)[:, np.newaxis]
confusion_matrix_svc_percent
array([[0.68571429, 0.28571429, 0.02857143],
[0.0733945 , 0.7706422 , 0.1559633 ],
[0. , 0.13636364, 0.86363636]])
Confusion_Matrix_Plotter(confusion_matrix_svc_percent, 0)
print(classification_report(y_test, y_pred_svc, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.75 0.69 0.72 35
Medium Price 0.77 0.77 0.77 109
High Price 0.84 0.86 0.85 110
accuracy 0.80 254
macro avg 0.79 0.77 0.78 254
weighted avg 0.80 0.80 0.80 254
svc_t1_l, svc_t2_l, svc_t1_m, svc_t2_m, svc_t1_h, svc_t2_h = Compute_Error(confusion_matrix_svc)
Type1_Error_LowPrice: 8 Type2_Error_LowPrice: 11 Type1_Error_MediumPrice: 25 Type2_Error_MediumPrice: 25 Type1_Error_HighPrice: 18 Type2_Error_HighPrice: 15
svc_pl, svc_pm, svc_ph = precision_score(y_test, y_pred_svc, average=None)
svc_rl, svc_rm, svc_rh = recall_score(y_test, y_pred_svc, average=None)
svc_fl, svc_fm, svc_fh = f1_score(y_test, y_pred_svc, average=None)
cv_svc = cross_val_score(svc, x_train, y_train, cv = 10, scoring='accuracy')
cv_svc
array([0.83333333, 0.78431373, 0.77227723, 0.88118812, 0.79207921,
0.81188119, 0.84158416, 0.87128713, 0.79207921, 0.83168317])
cv_svc_m = cv_svc.mean()
print("Cross Validation Score:", cv_svc_m)
Cross Validation Score: 0.8211706464764124
plot_learning_curves(x_train, y_train, x_test, y_test, svc)
plt.show()
Applying GridSearchCV.
svc.get_params()
{'C': 1.0,
'break_ties': False,
'cache_size': 200,
'class_weight': None,
'coef0': 0.0,
'decision_function_shape': 'ovr',
'degree': 3,
'gamma': 'scale',
'kernel': 'rbf',
'max_iter': -1,
'probability': False,
'random_state': None,
'shrinking': True,
'tol': 0.001,
'verbose': False}
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
'degree': [1, 2, 3, 4, 5], 'gamma': [1, 0.1, 0.01, 0.001]}
param_grid = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'sigmoid'],
'degree': [1, 2, 3], 'gamma': [1, 0.1, 0.01]}
svc_gscv = GridSearchCV(SVC(), param_grid, scoring = 'accuracy', cv = 10, refit=True, verbose=1)
svc_gscv.fit(x_train, y_train)
Fitting 10 folds for each of 54 candidates, totalling 540 fits
GridSearchCV(cv=10, estimator=SVC(),
param_grid={'C': [0.1, 1, 10], 'degree': [1, 2, 3],
'gamma': [1, 0.1, 0.01],
'kernel': ['rbf', 'sigmoid']},
scoring='accuracy', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=SVC(),
param_grid={'C': [0.1, 1, 10], 'degree': [1, 2, 3],
'gamma': [1, 0.1, 0.01],
'kernel': ['rbf', 'sigmoid']},
scoring='accuracy', verbose=1)SVC()
SVC()
# Model Scores on training and test set
print("Training Set score:", svc_gscv.score(x_train, y_train))
print("Test Set score:", svc_gscv.score(x_test, y_test))
Training Set score: 0.83399209486166 Test Set score: 0.8110236220472441
#printing best parameter after tuning
print("GridSearch CV Best Parameters:", svc_gscv.best_params_)
#printing how our model looks after hyper-parameter tuning
print("\nGridSearch CV Best Estimator:", svc_gscv.best_estimator_)
# best score achieved during the GridSearchCV
print("\nGridSearch CV Best score:", svc_gscv.best_score_)
cv_svc_gscv_b = svc_gscv.best_score_
GridSearch CV Best Parameters: {'C': 10, 'degree': 1, 'gamma': 1, 'kernel': 'rbf'}
GridSearch CV Best Estimator: SVC(C=10, degree=1, gamma=1)
GridSearch CV Best score: 0.8270821199767037
# Prediction on Testing Data
y_pred_svc = svc_gscv.predict(x_test)
svc_gscv_accuracy = metrics.accuracy_score(y_test, y_pred_svc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_svc))
Accuracy: 0.8110236220472441
# Prediction on Training Data
y_pred2_svc = svc_gscv.predict(x_train)
svc_gscv_taccuracy = metrics.accuracy_score(y_train, y_pred2_svc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_svc))
Accuracy: 0.83399209486166
confusion_matrix_svc = metrics.confusion_matrix(y_test, y_pred_svc)
confusion_matrix_svc
array([[25, 9, 1],
[ 8, 84, 17],
[ 0, 13, 97]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_svc, 1)
confusion_matrix_svc_percent = confusion_matrix_svc.astype('float') / confusion_matrix_svc.sum(axis=1)[:, np.newaxis]
confusion_matrix_svc_percent
array([[0.71428571, 0.25714286, 0.02857143],
[0.0733945 , 0.7706422 , 0.1559633 ],
[0. , 0.11818182, 0.88181818]])
Confusion_Matrix_Plotter(confusion_matrix_svc_percent, 0)
print(classification_report(y_test, y_pred_svc, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.76 0.71 0.74 35
Medium Price 0.79 0.77 0.78 109
High Price 0.84 0.88 0.86 110
accuracy 0.81 254
macro avg 0.80 0.79 0.79 254
weighted avg 0.81 0.81 0.81 254
svc_gs_t1_l, svc_gs_t2_l, svc_gs_t1_m, svc_gs_t2_m, svc_gs_t1_h, svc_gs_t2_h = Compute_Error(confusion_matrix_svc)
Type1_Error_LowPrice: 8 Type2_Error_LowPrice: 10 Type1_Error_MediumPrice: 22 Type2_Error_MediumPrice: 25 Type1_Error_HighPrice: 18 Type2_Error_HighPrice: 13
svc_gs_pl, svc_gs_pm, svc_gs_ph = precision_score(y_test, y_pred_svc, average=None)
svc_gs_rl, svc_gs_rm, svc_gs_rh = recall_score(y_test, y_pred_svc, average=None)
svc_gs_fl, svc_gs_fm, svc_gs_fh = f1_score(y_test, y_pred_svc, average=None)
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Model Scores on training and test set
print("Training Set score:", logreg.score(x_train, y_train))
print("Test Set score:", logreg.score(x_test, y_test))
Training Set score: 0.7529644268774703 Test Set score: 0.7283464566929134
# Prediction on Testing Data
y_pred_lr = logreg.predict(x_test)
lr_accuracy = metrics.accuracy_score(y_test, y_pred_lr)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_lr))
Accuracy: 0.7283464566929134
# Prediction on Training Data
y_pred2_lr = logreg.predict(x_train)
lr_taccuracy = metrics.accuracy_score(y_train, y_pred2_lr)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_lr))
Accuracy: 0.7529644268774703
confusion_matrix_lr = metrics.confusion_matrix(y_test, y_pred_lr)
confusion_matrix_lr
array([[16, 18, 1],
[ 3, 87, 19],
[ 0, 28, 82]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_lr, 1)
confusion_matrix_lr_percent = confusion_matrix_lr.astype('float') / confusion_matrix_lr.sum(axis=1)[:, np.newaxis]
confusion_matrix_lr_percent
array([[0.45714286, 0.51428571, 0.02857143],
[0.02752294, 0.79816514, 0.17431193],
[0. , 0.25454545, 0.74545455]])
Confusion_Matrix_Plotter(confusion_matrix_lr_percent, 0)
print(classification_report(y_test, y_pred_lr, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.84 0.46 0.59 35
Medium Price 0.65 0.80 0.72 109
High Price 0.80 0.75 0.77 110
accuracy 0.73 254
macro avg 0.77 0.67 0.70 254
weighted avg 0.74 0.73 0.73 254
lr_t1_l, lr_t2_l, lr_t1_m, lr_t2_m, lr_t1_h, lr_t2_h = Compute_Error(confusion_matrix_lr)
Type1_Error_LowPrice: 3 Type2_Error_LowPrice: 19 Type1_Error_MediumPrice: 46 Type2_Error_MediumPrice: 22 Type1_Error_HighPrice: 20 Type2_Error_HighPrice: 28
lr_pl, lr_pm, lr_ph = precision_score(y_test, y_pred_lr, average=None)
lr_rl, lr_rm, lr_rh = recall_score(y_test, y_pred_lr, average=None)
lr_fl, lr_fm, lr_fh = f1_score(y_test, y_pred_lr, average=None)
cv_lr = cross_val_score(logreg, x_train, y_train, cv = 10, scoring='accuracy')
cv_lr
array([0.80392157, 0.70588235, 0.7029703 , 0.79207921, 0.72277228,
0.69306931, 0.75247525, 0.72277228, 0.76237624, 0.76237624])
cv_lr_m = cv_lr.mean()
print("Cross Validation Score:", cv_lr_m)
Cross Validation Score: 0.7420695010677537
plot_learning_curves(x_train, y_train, x_test, y_test, logreg)
plt.show()
Applying GridSearchCV.
logreg.get_params()
{'C': 1.0,
'class_weight': None,
'dual': False,
'fit_intercept': True,
'intercept_scaling': 1,
'l1_ratio': None,
'max_iter': 100,
'multi_class': 'auto',
'n_jobs': None,
'penalty': 'l2',
'random_state': None,
'solver': 'lbfgs',
'tol': 0.0001,
'verbose': 0,
'warm_start': False}
parameters = {'penalty': ['l1', 'l2'], 'C': np.logspace(-3, 3, 7), 'solver': ['newton-cg', 'lbfgs', 'liblinear']}
lr_gs = GridSearchCV(estimator = logreg, param_grid = parameters, scoring = 'accuracy', cv = 10, verbose=1)
lr_gs.fit(x_train, y_train)
Fitting 10 folds for each of 42 candidates, totalling 420 fits
GridSearchCV(cv=10, estimator=LogisticRegression(),
param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
'penalty': ['l1', 'l2'],
'solver': ['newton-cg', 'lbfgs', 'liblinear']},
scoring='accuracy', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=LogisticRegression(),
param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
'penalty': ['l1', 'l2'],
'solver': ['newton-cg', 'lbfgs', 'liblinear']},
scoring='accuracy', verbose=1)LogisticRegression()
LogisticRegression()
print("GridSearch CV Best Parameters:", lr_gs.best_params_)
print("\nGridSearch CV Best Estimator:", lr_gs.best_estimator_)
print("\nGridSearch CV Best score:", lr_gs.best_score_)
cv_lr_gs_b = lr_gs.best_score_
GridSearch CV Best Parameters: {'C': 1000.0, 'penalty': 'l2', 'solver': 'newton-cg'}
GridSearch CV Best Estimator: LogisticRegression(C=1000.0, solver='newton-cg')
GridSearch CV Best score: 0.7865657153950689
# Prediction on Testing Data
y_pred_lr = lr_gs.predict(x_test)
lr_gs_accuracy = metrics.accuracy_score(y_test, y_pred_lr)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_lr))
Accuracy: 0.7559055118110236
# Prediction on Training Data
y_pred2_lr = lr_gs.predict(x_train)
lr_gs_taccuracy = metrics.accuracy_score(y_train, y_pred2_lr)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_lr))
Accuracy: 0.791501976284585
confusion_matrix_lr = metrics.confusion_matrix(y_test, y_pred_lr)
confusion_matrix_lr
array([[22, 12, 1],
[ 6, 86, 17],
[ 0, 26, 84]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_lr, 1)
confusion_matrix_lr_percent = confusion_matrix_lr.astype('float') / confusion_matrix_lr.sum(axis=1)[:, np.newaxis]
confusion_matrix_lr_percent
array([[0.62857143, 0.34285714, 0.02857143],
[0.05504587, 0.78899083, 0.1559633 ],
[0. , 0.23636364, 0.76363636]])
Confusion_Matrix_Plotter(confusion_matrix_lr_percent, 0)
print(classification_report(y_test, y_pred_lr, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.79 0.63 0.70 35
Medium Price 0.69 0.79 0.74 109
High Price 0.82 0.76 0.79 110
accuracy 0.76 254
macro avg 0.77 0.73 0.74 254
weighted avg 0.76 0.76 0.76 254
lr_gs_t1_l, lr_gs_t2_l, lr_gs_t1_m, lr_gs_t2_m, lr_gs_t1_h, lr_gs_t2_h = Compute_Error(confusion_matrix_lr)
Type1_Error_LowPrice: 6 Type2_Error_LowPrice: 13 Type1_Error_MediumPrice: 38 Type2_Error_MediumPrice: 23 Type1_Error_HighPrice: 18 Type2_Error_HighPrice: 26
lr_gs_pl, lr_gs_pm, lr_gs_ph = precision_score(y_test, y_pred_lr, average=None)
lr_gs_rl, lr_gs_rm, lr_gs_rh = recall_score(y_test, y_pred_lr, average=None)
lr_gs_fl, lr_gs_fm, lr_gs_fh = f1_score(y_test, y_pred_lr, average=None)
plot_learning_curves(x_train, y_train, x_test, y_test, lr_gs)
plt.show()
Fitting 10 folds for each of 42 candidates, totalling 420 fits Fitting 10 folds for each of 42 candidates, totalling 420 fits Fitting 10 folds for each of 42 candidates, totalling 420 fits Fitting 10 folds for each of 42 candidates, totalling 420 fits Fitting 10 folds for each of 42 candidates, totalling 420 fits Fitting 10 folds for each of 42 candidates, totalling 420 fits Fitting 10 folds for each of 42 candidates, totalling 420 fits Fitting 10 folds for each of 42 candidates, totalling 420 fits Fitting 10 folds for each of 42 candidates, totalling 420 fits Fitting 10 folds for each of 42 candidates, totalling 420 fits
abc = AdaBoostClassifier()
abc.fit(x_train, y_train)
AdaBoostClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
AdaBoostClassifier()
# Model Scores on training and test set
print("Training Set score:", abc.score(x_train, y_train))
print("Test Set score:", abc.score(x_test, y_test))
Training Set score: 0.7845849802371542 Test Set score: 0.7716535433070866
# Prediction on Testing Data
y_pred_abc = abc.predict(x_test)
abc_accuracy = metrics.accuracy_score(y_test, y_pred_abc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_abc))
Accuracy: 0.7716535433070866
# Prediction on Training Data
y_pred2_abc = abc.predict(x_train)
abc_taccuracy = metrics.accuracy_score(y_train, y_pred2_abc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_abc))
Accuracy: 0.7845849802371542
confusion_matrix_abc = metrics.confusion_matrix(y_test, y_pred_abc)
confusion_matrix_abc
array([[21, 14, 0],
[ 9, 88, 12],
[ 0, 23, 87]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_abc, 1)
confusion_matrix_abc_percent = confusion_matrix_abc.astype('float') / confusion_matrix_abc.sum(axis=1)[:, np.newaxis]
confusion_matrix_abc_percent
array([[0.6 , 0.4 , 0. ],
[0.08256881, 0.80733945, 0.11009174],
[0. , 0.20909091, 0.79090909]])
Confusion_Matrix_Plotter(confusion_matrix_abc_percent, 0)
print(classification_report(y_test, y_pred_abc, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.70 0.60 0.65 35
Medium Price 0.70 0.81 0.75 109
High Price 0.88 0.79 0.83 110
accuracy 0.77 254
macro avg 0.76 0.73 0.74 254
weighted avg 0.78 0.77 0.77 254
abc_t1_l, abc_t2_l, abc_t1_m, abc_t2_m, abc_t1_h, abc_t2_h = Compute_Error(confusion_matrix_abc)
Type1_Error_LowPrice: 9 Type2_Error_LowPrice: 14 Type1_Error_MediumPrice: 37 Type2_Error_MediumPrice: 21 Type1_Error_HighPrice: 12 Type2_Error_HighPrice: 23
abc_pl, abc_pm, abc_ph = precision_score(y_test, y_pred_abc, average=None)
abc_rl, abc_rm, abc_rh = recall_score(y_test, y_pred_abc, average=None)
abc_fl, abc_fm, abc_fh = f1_score(y_test, y_pred_abc, average=None)
cv_abc = cross_val_score(abc, x_train, y_train, cv = 10, scoring='accuracy')
cv_abc
array([0.73529412, 0.80392157, 0.73267327, 0.78217822, 0.78217822,
0.76237624, 0.71287129, 0.75247525, 0.84158416, 0.84158416])
cv_abc_m = cv_abc.mean()
print("Cross Validation Score:", cv_abc_m)
Cross Validation Score: 0.7747136478353718
plot_learning_curves(x_train, y_train, x_test, y_test, abc)
plt.show()
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier()
Applying GridSearchCV.
knn.get_params()
{'algorithm': 'auto',
'leaf_size': 30,
'metric': 'minkowski',
'metric_params': None,
'n_jobs': None,
'n_neighbors': 5,
'p': 2,
'weights': 'uniform'}
k_range = list(range(1, 31))
grid_params = {'n_neighbors': k_range, 'weights': ['uniform', 'distance'],
'metric': ['euclidean', 'manhattan', 'minkowski', 'chebyshev']}
knn_gscv = GridSearchCV(KNeighborsClassifier(), grid_params, cv=10)
knn_gscv.fit(x_train, y_train)
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
param_grid={'metric': ['euclidean', 'manhattan', 'minkowski',
'chebyshev'],
'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30],
'weights': ['uniform', 'distance']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
param_grid={'metric': ['euclidean', 'manhattan', 'minkowski',
'chebyshev'],
'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30],
'weights': ['uniform', 'distance']})KNeighborsClassifier()
KNeighborsClassifier()
# Model Scores on training and test set
print("Training Set score:", knn_gscv.score(x_train, y_train))
print("Test Set score:", knn_gscv.score(x_test, y_test))
Training Set score: 1.0 Test Set score: 0.8779527559055118
print("GridSearch CV Best Parameters:", knn_gscv.best_params_)
print("\nGridSearch CV Best Estimator:", knn_gscv.best_estimator_)
print("\nGridSearch CV Best score:", knn_gscv.best_score_)
cv_knn_b = knn_gscv.best_score_
GridSearch CV Best Parameters: {'metric': 'manhattan', 'n_neighbors': 6, 'weights': 'distance'}
GridSearch CV Best Estimator: KNeighborsClassifier(metric='manhattan', n_neighbors=6, weights='distance')
GridSearch CV Best score: 0.8873519704911667
# Prediction on Testing Data
y_pred_knn = knn_gscv.predict(x_test)
knn_accuracy = metrics.accuracy_score(y_test, y_pred_knn)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_knn))
Accuracy: 0.8779527559055118
# Prediction on Training Data
y_pred2_knn = knn_gscv.predict(x_train)
knn_taccuracy = metrics.accuracy_score(y_train, y_pred2_knn)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_knn))
Accuracy: 1.0
confusion_matrix_knn = metrics.confusion_matrix(y_test, y_pred_knn)
confusion_matrix_knn
array([[ 30, 5, 0],
[ 10, 89, 10],
[ 0, 6, 104]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_knn, 1)
confusion_matrix_knn_percent = confusion_matrix_knn.astype('float') / confusion_matrix_knn.sum(axis=1)[:, np.newaxis]
confusion_matrix_knn_percent
array([[0.85714286, 0.14285714, 0. ],
[0.09174312, 0.81651376, 0.09174312],
[0. , 0.05454545, 0.94545455]])
Confusion_Matrix_Plotter(confusion_matrix_knn_percent, 0)
print(classification_report(y_test, y_pred_knn, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.75 0.86 0.80 35
Medium Price 0.89 0.82 0.85 109
High Price 0.91 0.95 0.93 110
accuracy 0.88 254
macro avg 0.85 0.87 0.86 254
weighted avg 0.88 0.88 0.88 254
knn_t1_l, knn_t2_l, knn_t1_m, knn_t2_m, knn_t1_h, knn_t2_h = Compute_Error(confusion_matrix_knn)
Type1_Error_LowPrice: 10 Type2_Error_LowPrice: 5 Type1_Error_MediumPrice: 11 Type2_Error_MediumPrice: 20 Type1_Error_HighPrice: 10 Type2_Error_HighPrice: 6
knn_pl, knn_pm, knn_ph = precision_score(y_test, y_pred_knn, average=None)
knn_rl, knn_rm, knn_rh = recall_score(y_test, y_pred_knn, average=None)
knn_fl, knn_fm, knn_fh = f1_score(y_test, y_pred_knn, average=None)
plot_learning_curves(x_train, y_train, x_test, y_test, knn_gscv)
plt.show()
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
# Model Scores on training and test set
print("Training Set score:", dtc.score(x_train, y_train))
print("Test Set score:", dtc.score(x_test, y_test))
Training Set score: 1.0 Test Set score: 0.8385826771653543
def Plotter(df):
plt.figure(figsize = (100, 35))
print_tree = tree.plot_tree(df,
feature_names = features,
class_names = ['1','2','3'],
rounded = True,
filled = True)
plt.show()
Plotter(dtc)
dtc_print_tree = tree.export_text(dtc, feature_names = features)
# print_tree
# Prediction on Testing Data
y_pred_dtc = dtc.predict(x_test)
dtc_accuracy = metrics.accuracy_score(y_test, y_pred_dtc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_dtc))
Accuracy: 0.8385826771653543
# Prediction on Training Data
y_pred2_dtc = dtc.predict(x_train)
dtc_taccuracy = metrics.accuracy_score(y_train, y_pred2_dtc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_dtc))
Accuracy: 1.0
confusion_matrix_dtc = metrics.confusion_matrix(y_test, y_pred_dtc)
confusion_matrix_dtc
array([[30, 5, 0],
[ 6, 89, 14],
[ 1, 15, 94]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_dtc, 1)
confusion_matrix_dtc_percent = confusion_matrix_dtc.astype('float') / confusion_matrix_dtc.sum(axis=1)[:, np.newaxis]
confusion_matrix_dtc_percent
array([[0.85714286, 0.14285714, 0. ],
[0.05504587, 0.81651376, 0.12844037],
[0.00909091, 0.13636364, 0.85454545]])
Confusion_Matrix_Plotter(confusion_matrix_dtc_percent, 0)
print(classification_report(y_test, y_pred_dtc, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.81 0.86 0.83 35
Medium Price 0.82 0.82 0.82 109
High Price 0.87 0.85 0.86 110
accuracy 0.84 254
macro avg 0.83 0.84 0.84 254
weighted avg 0.84 0.84 0.84 254
dt_t1_l, dt_t2_l, dt_t1_m, dt_t2_m, dt_t1_h, dt_t2_h = Compute_Error(confusion_matrix_dtc)
Type1_Error_LowPrice: 7 Type2_Error_LowPrice: 5 Type1_Error_MediumPrice: 20 Type2_Error_MediumPrice: 20 Type1_Error_HighPrice: 14 Type2_Error_HighPrice: 16
dt_pl, dt_pm, dt_ph = precision_score(y_test, y_pred_dtc, average=None)
dt_rl, dt_rm, dt_rh = recall_score(y_test, y_pred_dtc, average=None)
dt_fl, dt_fm, dt_fh = f1_score(y_test, y_pred_dtc, average=None)
cv_dt = cross_val_score(dtc, x_train, y_train, cv = 10, scoring='accuracy')
cv_dt
array([0.81372549, 0.82352941, 0.76237624, 0.85148515, 0.88118812,
0.85148515, 0.85148515, 0.86138614, 0.87128713, 0.8019802 ])
cv_dt_m = cv_dt.mean()
print("Cross Validation Score:", cv_dt_m)
Cross Validation Score: 0.8369928169287517
feature_importance = pd.Series(dtc.feature_importances_, index = features).sort_values(ascending = False)
sns.barplot(x = feature_importance, y = feature_importance.index)
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")
plt.tight_layout()
plot_learning_curves(x_train, y_train, x_test, y_test, dtc)
plt.show()
Finding the best parameter max_leaf_nodes using GridSearchCV()
dtc.get_params()
{'ccp_alpha': 0.0,
'class_weight': None,
'criterion': 'gini',
'max_depth': None,
'max_features': None,
'max_leaf_nodes': None,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'random_state': None,
'splitter': 'best'}
leaf_nodes_list = list(range(1, 16))
parameters = {'max_leaf_nodes': leaf_nodes_list, 'criterion': ['gini', 'entropy'], 'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 3, 4, 5, 6]}
dt_gscv = GridSearchCV(estimator = dtc, param_grid = parameters, scoring = 'accuracy', cv = 10)
dt_gscv = dt_gscv.fit(x_train, y_train)
print("Best Parameters:", dt_gscv.best_params_)
Best Parameters: {'criterion': 'gini', 'max_leaf_nodes': 11, 'min_samples_leaf': 6, 'min_samples_split': 2}
print("GridSearch CV Best Parameters:", dt_gscv.best_params_)
print("\nGridSearch CV Best Estimator:", dt_gscv.best_estimator_)
print("\nGridSearch CV Best score:", dt_gscv.best_score_)
cv_dt_gscv_b = dt_gscv.best_score_
GridSearch CV Best Parameters: {'criterion': 'gini', 'max_leaf_nodes': 11, 'min_samples_leaf': 6, 'min_samples_split': 2}
GridSearch CV Best Estimator: DecisionTreeClassifier(max_leaf_nodes=11, min_samples_leaf=6)
GridSearch CV Best score: 0.8300524170064065
nleaf_list = []
score_list = []
for i in range(2, 16):
nleaf_list.append(i)
parameters = {'max_leaf_nodes': [i]}
grid_search = GridSearchCV(estimator = dtc, param_grid = parameters, scoring = 'accuracy', cv = 10)
grid_search = grid_search.fit(x_train, y_train)
score_list.append(grid_search.best_score_)
# Plot of tree sizes VS classification rate.
plt.plot(nleaf_list, score_list)
plt.scatter(nleaf_list, score_list)
plt.title("Plot of Tree Size VS Classification Rate")
# plt.grid(True)
Text(0.5, 1.0, 'Plot of Tree Size VS Classification Rate')
Plotting the Pruned Tree
dt_gscv.fit(x_train, y_train)
GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_leaf_nodes': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15],
'min_samples_leaf': [1, 2, 3, 4, 5, 6],
'min_samples_split': [2, 5, 10]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_leaf_nodes': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15],
'min_samples_leaf': [1, 2, 3, 4, 5, 6],
'min_samples_split': [2, 5, 10]},
scoring='accuracy')DecisionTreeClassifier()
DecisionTreeClassifier()
dtc_pt = dt_gscv
dtc_pt2 = DecisionTreeClassifier(max_leaf_nodes = 11, min_samples_leaf = 6)
dtc_pt2.fit(x_train, y_train)
Plotter(dtc_pt2)
# Prediction on Testing Data
y_pred_dtc_pt = dtc_pt.predict(x_test)
dtc_pt_accuracy = metrics.accuracy_score(y_test, y_pred_dtc_pt)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_dtc_pt))
Accuracy: 0.8188976377952756
# Prediction on Training Data
y_pred2_dtc_pt = dtc_pt.predict(x_train)
dtc_pt_taccuracy = metrics.accuracy_score(y_train, y_pred2_dtc_pt)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_dtc_pt))
Accuracy: 0.8507905138339921
confusion_matrix_dtc_pt = metrics.confusion_matrix(y_test, y_pred_dtc_pt)
confusion_matrix_dtc_pt
array([[ 29, 5, 1],
[ 11, 75, 23],
[ 0, 6, 104]], dtype=int64)
# cm_df = pd.DataFrame(confusion_matrix_dtc_pt)
# sns.heatmap(cm_df, annot=True, fmt="d")
# plt.tight_layout()
Confusion_Matrix_Plotter(confusion_matrix_dtc_pt, 1)
confusion_matrix_dtc_pt_percent = confusion_matrix_dtc_pt.astype('float') / confusion_matrix_dtc_pt.sum(axis=1)[:, np.newaxis]
confusion_matrix_dtc_pt_percent
array([[0.82857143, 0.14285714, 0.02857143],
[0.10091743, 0.68807339, 0.21100917],
[0. , 0.05454545, 0.94545455]])
Confusion_Matrix_Plotter(confusion_matrix_dtc_pt_percent, 0)
print(classification_report(y_test, y_pred_dtc_pt, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.72 0.83 0.77 35
Medium Price 0.87 0.69 0.77 109
High Price 0.81 0.95 0.87 110
accuracy 0.82 254
macro avg 0.80 0.82 0.81 254
weighted avg 0.83 0.82 0.82 254
dtp_t1_l, dtp_t2_l, dtp_t1_m, dtp_t2_m, dtp_t1_h, dtp_t2_h = Compute_Error(confusion_matrix_dtc_pt)
Type1_Error_LowPrice: 11 Type2_Error_LowPrice: 6 Type1_Error_MediumPrice: 11 Type2_Error_MediumPrice: 34 Type1_Error_HighPrice: 24 Type2_Error_HighPrice: 6
dtp_pl, dtp_pm, dtp_ph = precision_score(y_test, y_pred_dtc_pt, average=None)
dtp_rl, dtp_rm, dtp_rh = recall_score(y_test, y_pred_dtc_pt, average=None)
dtp_fl, dtp_fm, dtp_fh = f1_score(y_test, y_pred_dtc_pt, average=None)
feature_importance = pd.Series(dtc_pt2.feature_importances_, index = features).sort_values(ascending = False)
sns.barplot(x = feature_importance, y = feature_importance.index)
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")
plt.tight_layout()
plot_learning_curves(x_train, y_train, x_test, y_test, dtc_pt2) #dtc_pt2 dt_gscv
plt.show()
Training a Random Forest with the best parameter max_leaf_nodes using RandomForestClassifier()
rfc = RandomForestClassifier(max_leaf_nodes = 11, n_estimators = 100)
rfc.fit(x_train, y_train)
RandomForestClassifier(max_leaf_nodes=11)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_leaf_nodes=11)
# Model Scores on training and test set
print("Training Set score:", rfc.score(x_train, y_train))
print("Test Set score:", rfc.score(x_test, y_test))
Training Set score: 0.8596837944664032 Test Set score: 0.8307086614173228
# Prediction on Testing Data
y_pred_rfc = rfc.predict(x_test)
rfc_accuracy = metrics.accuracy_score(y_test, y_pred_rfc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_rfc))
Accuracy: 0.8307086614173228
# Prediction on Training Data
y_pred2_rfc = rfc.predict(x_train)
rfc_taccuracy = metrics.accuracy_score(y_train, y_pred2_rfc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_rfc))
Accuracy: 0.8596837944664032
confusion_matrix_rfc = metrics.confusion_matrix(y_test, y_pred_rfc)
confusion_matrix_rfc
array([[ 27, 7, 1],
[ 7, 82, 20],
[ 0, 8, 102]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_rfc, 1)
confusion_matrix_rfc_percent = confusion_matrix_rfc.astype('float') / confusion_matrix_rfc.sum(axis=1)[:, np.newaxis]
confusion_matrix_rfc_percent
array([[0.77142857, 0.2 , 0.02857143],
[0.06422018, 0.75229358, 0.18348624],
[0. , 0.07272727, 0.92727273]])
Confusion_Matrix_Plotter(confusion_matrix_rfc_percent, 0)
print(classification_report(y_test, y_pred_rfc, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.79 0.77 0.78 35
Medium Price 0.85 0.75 0.80 109
High Price 0.83 0.93 0.88 110
accuracy 0.83 254
macro avg 0.82 0.82 0.82 254
weighted avg 0.83 0.83 0.83 254
rfc_t1_l, rfc_t2_l, rfc_t1_m, rfc_t2_m, rfc_t1_h, rfc_t2_h = Compute_Error(confusion_matrix_rfc)
Type1_Error_LowPrice: 7 Type2_Error_LowPrice: 8 Type1_Error_MediumPrice: 15 Type2_Error_MediumPrice: 27 Type1_Error_HighPrice: 21 Type2_Error_HighPrice: 8
rfc_pl, rfc_pm, rfc_ph = precision_score(y_test, y_pred_rfc, average=None)
rfc_rl, rfc_rm, rfc_rh = recall_score(y_test, y_pred_rfc, average=None)
rfc_fl, rfc_fm, rfc_fh = f1_score(y_test, y_pred_rfc, average=None)
cv_rfc = cross_val_score(rfc, x_train, y_train, cv = 10, scoring='accuracy')
cv_rfc
array([0.85294118, 0.78431373, 0.8019802 , 0.9009901 , 0.82178218,
0.84158416, 0.81188119, 0.82178218, 0.85148515, 0.81188119])
cv_rfc_m = cv_rfc.mean()
print("Cross Validation Score:", cv_rfc_m)
Cross Validation Score: 0.8300621238594449
feature_importance = pd.Series(rfc.feature_importances_, index = features).sort_values(ascending = False)
sns.barplot(x = feature_importance, y = feature_importance.index)
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")
plt.tight_layout()
plot_learning_curves(x_train, y_train, x_test, y_test, rfc)
plt.show()
rfc.get_params()
{'bootstrap': True,
'ccp_alpha': 0.0,
'class_weight': None,
'criterion': 'gini',
'max_depth': None,
'max_features': 'sqrt',
'max_leaf_nodes': 11,
'max_samples': None,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_jobs': None,
'oob_score': False,
'random_state': None,
'verbose': 0,
'warm_start': False}
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators
[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
max_depth
[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]
leaf_nodes_list = list(range(1, 16))
random_grid = {'n_estimators': n_estimators, 'max_features': ['auto', 'sqrt'],
'max_depth': max_depth, 'max_leaf_nodes': leaf_nodes_list,
'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 3, 4, 5, 6], 'bootstrap': [True, False]}
rfc_rscv = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, cv = 10,
verbose=1, n_jobs = -1)
rfc_rscv.fit(x_train, y_train)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
param_distributions={'bootstrap': [True, False],
'max_depth': [10, 20, 30, 40, 50, 60,
70, 80, 90, 100, 110,
None],
'max_features': ['auto', 'sqrt'],
'max_leaf_nodes': [1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13,
14, 15],
'min_samples_leaf': [1, 2, 3, 4, 5, 6],
'min_samples_split': [2, 5, 10],
'n_estimators': [200, 400, 600, 800,
1000, 1200, 1400, 1600,
1800, 2000]},
verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
param_distributions={'bootstrap': [True, False],
'max_depth': [10, 20, 30, 40, 50, 60,
70, 80, 90, 100, 110,
None],
'max_features': ['auto', 'sqrt'],
'max_leaf_nodes': [1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13,
14, 15],
'min_samples_leaf': [1, 2, 3, 4, 5, 6],
'min_samples_split': [2, 5, 10],
'n_estimators': [200, 400, 600, 800,
1000, 1200, 1400, 1600,
1800, 2000]},
verbose=1)RandomForestClassifier()
RandomForestClassifier()
print("GridSearch CV Best Parameters:", rfc_rscv.best_params_)
print("\nGridSearch CV Best Estimator:", rfc_rscv.best_estimator_)
print("\nGridSearch CV Best score:", rfc_rscv.best_score_)
cv_rfc_rscv_b = rfc_rscv.best_score_
GridSearch CV Best Parameters: {'n_estimators': 1600, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_leaf_nodes': 14, 'max_features': 'auto', 'max_depth': None, 'bootstrap': False}
GridSearch CV Best Estimator: RandomForestClassifier(bootstrap=False, max_features='auto', max_leaf_nodes=14,
min_samples_leaf=4, min_samples_split=5,
n_estimators=1600)
GridSearch CV Best score: 0.8350223257619879
# Model Scores on training and test set
print("Training Set score:", rfc_rscv.score(x_train, y_train))
print("Test Set score:", rfc_rscv.score(x_test, y_test))
Training Set score: 0.8616600790513834 Test Set score: 0.8267716535433071
# Prediction on Testing Data
y_pred_rfc_rscv = rfc_rscv.predict(x_test)
rfc_rscv_accuracy = metrics.accuracy_score(y_test, y_pred_rfc_rscv)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_rfc_rscv))
Accuracy: 0.8267716535433071
# Prediction on Training Data
y_pred2_rfc_rscv = rfc_rscv.predict(x_train)
rfc_rscv_taccuracy = metrics.accuracy_score(y_train, y_pred2_rfc_rscv)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_rfc_rscv))
Accuracy: 0.8616600790513834
confusion_matrix_rfc_rscv = metrics.confusion_matrix(y_test, y_pred_rfc_rscv)
confusion_matrix_rfc_rscv
array([[ 27, 7, 1],
[ 8, 80, 21],
[ 0, 7, 103]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_rfc_rscv, 1)
confusion_matrix_rfc_rscv_percent = confusion_matrix_rfc_rscv.astype('float') / confusion_matrix_rfc_rscv.sum(axis=1)[:, np.newaxis]
confusion_matrix_rfc_rscv_percent
array([[0.77142857, 0.2 , 0.02857143],
[0.0733945 , 0.73394495, 0.19266055],
[0. , 0.06363636, 0.93636364]])
Confusion_Matrix_Plotter(confusion_matrix_rfc_rscv_percent, 0)
print(classification_report(y_test, y_pred_rfc_rscv, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.77 0.77 0.77 35
Medium Price 0.85 0.73 0.79 109
High Price 0.82 0.94 0.88 110
accuracy 0.83 254
macro avg 0.82 0.81 0.81 254
weighted avg 0.83 0.83 0.82 254
rfc_rscv_t1_l, rfc_rscv_t2_l, rfc_rscv_t1_m, rfc_rscv_t2_m, rfc_rscv_t1_h, rfc_rscv_t2_h = Compute_Error(confusion_matrix_rfc_rscv)
Type1_Error_LowPrice: 8 Type2_Error_LowPrice: 8 Type1_Error_MediumPrice: 14 Type2_Error_MediumPrice: 29 Type1_Error_HighPrice: 22 Type2_Error_HighPrice: 7
rfc_rscv_pl, rfc_rscv_pm, rfc_rscv_ph = precision_score(y_test, y_pred_rfc_rscv, average=None)
rfc_rscv_rl, rfc_rscv_rm, rfc_rscv_rh = recall_score(y_test, y_pred_rfc_rscv, average=None)
rfc_rscv_fl, rfc_rscv_fm, rfc_rscv_fh = f1_score(y_test, y_pred_rfc_rscv, average=None)
rfc2 = RandomForestClassifier(max_leaf_nodes=14, min_samples_leaf=5, min_samples_split=10, n_estimators=400)
rfc2.fit(x_train, y_train)
RandomForestClassifier(max_leaf_nodes=14, min_samples_leaf=5,
min_samples_split=10, n_estimators=400)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(max_leaf_nodes=14, min_samples_leaf=5,
min_samples_split=10, n_estimators=400)feature_importance = pd.Series(rfc2.feature_importances_, index = features).sort_values(ascending = False)
sns.barplot(x = feature_importance, y = feature_importance.index)
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")
plt.tight_layout()
# plot_learning_curves(x_train, y_train, x_test, y_test, rfc_rscv)
# plt.show()
hidden_layer_sizes : This parameter allows us to set the number of layers and the number of nodes we wish to have in the Neural Network Classifier. Each element in the tuple represents the number of nodes at the ith position where i is the index of the tuple. Thus the length of tuple denotes the total number of hidden layers in the network.
mlp = MLPClassifier(solver='adam', activation='logistic', hidden_layer_sizes=(48, 24, 12), alpha=1e-06, max_iter=5000)
mlp.fit(x_train, y_train)
MLPClassifier(activation='logistic', alpha=1e-06,
hidden_layer_sizes=(48, 24, 12), max_iter=5000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. MLPClassifier(activation='logistic', alpha=1e-06,
hidden_layer_sizes=(48, 24, 12), max_iter=5000)# Prediction on Testing Data
y_pred_mlp = mlp.predict(x_test)
# Accuracy Score = (TP + TN)/ (TP + FN + TN + FP)
mlp_accuracy = metrics.accuracy_score(y_test, y_pred_mlp)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_mlp))
Accuracy: 0.42913385826771655
# Prediction on Training Data
y_pred2_mlp = mlp.predict(x_train)
mlp_taccuracy = metrics.accuracy_score(y_train, y_pred2_mlp)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_mlp))
Accuracy: 0.4426877470355731
# Mean accuracy on the given test data and label
mlp.score(x_test, y_pred_mlp)
1.0
# Model Scores on training and test set
print("Training Set score:", mlp.score(x_train, y_train))
print("Test Set score:", mlp.score(x_test, y_test))
Training Set score: 0.4426877470355731 Test Set score: 0.42913385826771655
y_pred_mlp
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
print("Number of layers:", mlp.n_layers_)
print("Number of iterations the solver has run:", mlp.n_iter_)
print("Computed Loss:", mlp.loss_)
print("Minimum loss reached by the solver throughout fitting:", mlp.best_loss_)
print("Number of features seen during fit:", mlp.n_features_in_)
print("Output activation function:", mlp.out_activation_) #logistic sigmoid function: returns f(x) = 1 / (1 + exp(-x)).
Number of layers: 5 Number of iterations the solver has run: 28 Computed Loss: 1.0343383110256572 Minimum loss reached by the solver throughout fitting: 1.0335607958416024 Number of features seen during fit: 5 Output activation function: softmax
confusion_matrix_mlp = metrics.confusion_matrix(y_test, y_pred_mlp)
confusion_matrix_mlp
array([[ 0, 35, 0],
[ 0, 109, 0],
[ 0, 110, 0]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_mlp, 1)
confusion_matrix_mlp_percent = confusion_matrix_mlp.astype('float') / confusion_matrix_mlp.sum(axis=1)[:, np.newaxis]
confusion_matrix_mlp_percent
array([[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.]])
Confusion_Matrix_Plotter(confusion_matrix_mlp_percent, 0)
print(classification_report(y_test, y_pred_mlp, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.00 0.00 0.00 35
Medium Price 0.43 1.00 0.60 109
High Price 0.00 0.00 0.00 110
accuracy 0.43 254
macro avg 0.14 0.33 0.20 254
weighted avg 0.18 0.43 0.26 254
mlp1_t1_l, mlp1_t2_l, mlp1_t1_m, mlp1_t2_m, mlp1_t1_h, mlp1_t2_h = Compute_Error(confusion_matrix_mlp)
Type1_Error_LowPrice: 0 Type2_Error_LowPrice: 35 Type1_Error_MediumPrice: 145 Type2_Error_MediumPrice: 0 Type1_Error_HighPrice: 0 Type2_Error_HighPrice: 110
mlp1_pl, mlp1_pm, mlp1_ph = precision_score(y_test, y_pred_mlp, average=None)
mlp1_rl, mlp1_rm, mlp1_rh = recall_score(y_test, y_pred_mlp, average=None)
mlp1_fl, mlp1_fm, mlp1_fh = f1_score(y_test, y_pred_mlp, average=None)
plt.plot(mlp.loss_curve_)
plt.title("Loss Curve")
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.show()
Neural Network with a different network structure
mlp2 = MLPClassifier(solver='adam', activation='logistic', hidden_layer_sizes=(500, 250), alpha=1e-08, max_iter=5000)
mlp2.fit(x_train, y_train)
MLPClassifier(activation='logistic', alpha=1e-08, hidden_layer_sizes=(500, 250),
max_iter=5000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. MLPClassifier(activation='logistic', alpha=1e-08, hidden_layer_sizes=(500, 250),
max_iter=5000)# Prediction on Testing Data
y_pred_mlp2 = mlp2.predict(x_test)
mlp2_accuracy = metrics.accuracy_score(y_test, y_pred_mlp2)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_mlp2))
Accuracy: 0.7047244094488189
# Prediction on Training Data
y_pred2_mlp2 = mlp2.predict(x_train)
mlp2_taccuracy = metrics.accuracy_score(y_train, y_pred2_mlp2)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_mlp2))
Accuracy: 0.733201581027668
# Mean accuracy on the given test data and label
mlp2.score(x_test, y_pred_mlp2)
1.0
# Model Scores on training and test set
print("Training Set score:", mlp2.score(x_train, y_train))
print("Test Set score:", mlp2.score(x_test, y_test))
Training Set score: 0.733201581027668 Test Set score: 0.7047244094488189
y_pred_mlp2
array([1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 0, 2, 1, 1, 1, 1, 1, 1,
0, 2, 1, 1, 1, 1, 2, 1, 1, 2, 0, 1, 0, 2, 2, 1, 2, 2, 1, 1, 1, 1,
2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 0, 2, 1, 2, 1, 0, 1, 1, 2,
2, 1, 1, 0, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1,
1, 1, 1, 0, 2, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2,
1, 1, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 2, 1, 1, 1, 2, 1, 2, 1, 0, 1,
1, 2, 0, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2,
2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 0, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1,
1, 2, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1,
2, 0, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1,
1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1,
2, 0, 1, 2, 2, 1, 1, 1, 2, 2, 0, 2], dtype=int64)
print("Number of layers:", mlp2.n_layers_)
print("Number of iterations the solver has run:", mlp2.n_iter_)
print("Computed Loss:", mlp2.loss_)
print("Minimum loss reached by the solver throughout fitting:", mlp2.best_loss_)
print("Number of features seen during fit:", mlp2.n_features_in_)
print("Output activation function:", mlp2.out_activation_)
Number of layers: 4 Number of iterations the solver has run: 93 Computed Loss: 0.6468561423488488 Minimum loss reached by the solver throughout fitting: 0.5997767700479348 Number of features seen during fit: 5 Output activation function: softmax
confusion_matrix_mlp2 = metrics.confusion_matrix(y_test, y_pred_mlp2)
confusion_matrix_mlp2
array([[15, 19, 1],
[ 4, 90, 15],
[ 0, 36, 74]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_mlp2, 1)
confusion_matrix_mlp2_percent = confusion_matrix_mlp2.astype('float') / confusion_matrix_mlp2.sum(axis=1)[:, np.newaxis]
confusion_matrix_mlp2_percent
array([[0.42857143, 0.54285714, 0.02857143],
[0.03669725, 0.82568807, 0.13761468],
[0. , 0.32727273, 0.67272727]])
Confusion_Matrix_Plotter(confusion_matrix_mlp2_percent, 0)
print(classification_report(y_test, y_pred_mlp2, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.79 0.43 0.56 35
Medium Price 0.62 0.83 0.71 109
High Price 0.82 0.67 0.74 110
accuracy 0.70 254
macro avg 0.74 0.64 0.67 254
weighted avg 0.73 0.70 0.70 254
mlp2_t1_l, mlp2_t2_l, mlp2_t1_m, mlp2_t2_m, mlp2_t1_h, mlp2_t2_h = Compute_Error(confusion_matrix_mlp2)
Type1_Error_LowPrice: 4 Type2_Error_LowPrice: 20 Type1_Error_MediumPrice: 55 Type2_Error_MediumPrice: 19 Type1_Error_HighPrice: 16 Type2_Error_HighPrice: 36
mlp2_pl, mlp2_pm, mlp2_ph = precision_score(y_test, y_pred_mlp2, average=None)
mlp2_rl, mlp2_rm, mlp2_rh = recall_score(y_test, y_pred_mlp2, average=None)
mlp2_fl, mlp2_fm, mlp2_fh = f1_score(y_test, y_pred_mlp2, average=None)
plt.plot(mlp2.loss_curve_)
plt.title("Loss Curve")
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.show()
improvement_mlp = mlp2_accuracy - mlp_accuracy
improvement_mlp * 100
27.55905511811023
from xgboost import XGBClassifier
xgbc = XGBClassifier()
print(xgbc)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)
xgbc.fit(x_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100,
n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
predictor='auto', ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100,
n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
predictor='auto', ...)# Model Scores on training and test set
print("Training Set score:", xgbc.score(x_train, y_train))
print("Test Set score:", xgbc.score(x_test, y_test))
Training Set score: 1.0 Test Set score: 0.8622047244094488
# Prediction on Testing Data
y_pred_xgbc = xgbc.predict(x_test)
xgbc_accuracy = metrics.accuracy_score(y_test, y_pred_xgbc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_xgbc))
Accuracy: 0.8622047244094488
# Prediction on Training Data
y_pred2_xgbc = xgbc.predict(x_train)
xgbc_taccuracy = metrics.accuracy_score(y_train, y_pred2_xgbc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_xgbc))
Accuracy: 1.0
confusion_matrix_xgbc = metrics.confusion_matrix(y_test, y_pred_xgbc)
confusion_matrix_xgbc
array([[ 29, 6, 0],
[ 10, 87, 12],
[ 0, 7, 103]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_xgbc, 1)
confusion_matrix_xgbc_percent = confusion_matrix_xgbc.astype('float') / confusion_matrix_xgbc.sum(axis=1)[:, np.newaxis]
confusion_matrix_xgbc_percent
array([[0.82857143, 0.17142857, 0. ],
[0.09174312, 0.79816514, 0.11009174],
[0. , 0.06363636, 0.93636364]])
Confusion_Matrix_Plotter(confusion_matrix_xgbc_percent, 0)
print(classification_report(y_test, y_pred_xgbc, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.74 0.83 0.78 35
Medium Price 0.87 0.80 0.83 109
High Price 0.90 0.94 0.92 110
accuracy 0.86 254
macro avg 0.84 0.85 0.84 254
weighted avg 0.86 0.86 0.86 254
xgbc_t1_l, xgbc_t2_l, xgbc_t1_m, xgbc_t2_m, xgbc_t1_h, xgbc_t2_h = Compute_Error(confusion_matrix_xgbc)
Type1_Error_LowPrice: 10 Type2_Error_LowPrice: 6 Type1_Error_MediumPrice: 13 Type2_Error_MediumPrice: 22 Type1_Error_HighPrice: 12 Type2_Error_HighPrice: 7
xgbc_pl, xgbc_pm, xgbc_ph = precision_score(y_test, y_pred_xgbc, average=None)
xgbc_rl, xgbc_rm, xgbc_rh = recall_score(y_test, y_pred_xgbc, average=None)
xgbc_fl, xgbc_fm, xgbc_fh = f1_score(y_test, y_pred_xgbc, average=None)
cv_xgbc = cross_val_score(xgbc, x_train, y_train, cv = 10, scoring='accuracy')
cv_xgbc
array([0.8627451 , 0.83333333, 0.84158416, 0.91089109, 0.88118812,
0.89108911, 0.87128713, 0.88118812, 0.89108911, 0.86138614])
cv_xgbc_m = cv_xgbc.mean()
print("Cross Validation Score:", cv_xgbc_m)
Cross Validation Score: 0.8725781401669579
plot_learning_curves(x_train, y_train, x_test, y_test, xgbc)
plt.show()
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.get_params()
{'alpha': 0.0001,
'average': False,
'class_weight': None,
'early_stopping': False,
'epsilon': 0.1,
'eta0': 0.0,
'fit_intercept': True,
'l1_ratio': 0.15,
'learning_rate': 'optimal',
'loss': 'hinge',
'max_iter': 1000,
'n_iter_no_change': 5,
'n_jobs': None,
'penalty': 'l2',
'power_t': 0.5,
'random_state': None,
'shuffle': True,
'tol': 0.001,
'validation_fraction': 0.1,
'verbose': 0,
'warm_start': False}
sgd.fit(x_train, y_train)
SGDClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SGDClassifier()
# Model Scores on training and test set
print("Training Set score:", sgd.score(x_train, y_train))
print("Test Set score:", sgd.score(x_test, y_test))
Training Set score: 0.7262845849802372 Test Set score: 0.7165354330708661
# Prediction on Testing Data
y_pred_sgd = sgd.predict(x_test)
sgd_accuracy = metrics.accuracy_score(y_test, y_pred_sgd)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_sgd))
Accuracy: 0.7165354330708661
# Prediction on Training Data
y_pred2_sgd = sgd.predict(x_train)
sgd_taccuracy = metrics.accuracy_score(y_train, y_pred2_sgd)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_sgd))
Accuracy: 0.7262845849802372
confusion_matrix_sgd = metrics.confusion_matrix(y_test, y_pred_sgd)
confusion_matrix_sgd
array([[10, 24, 1],
[ 5, 83, 21],
[ 0, 21, 89]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_sgd, 1)
confusion_matrix_sgdc_percent = confusion_matrix_sgd.astype('float') / confusion_matrix_sgd.sum(axis=1)[:, np.newaxis]
confusion_matrix_sgdc_percent
array([[0.28571429, 0.68571429, 0.02857143],
[0.04587156, 0.76146789, 0.19266055],
[0. , 0.19090909, 0.80909091]])
Confusion_Matrix_Plotter(confusion_matrix_sgdc_percent, 0)
print(classification_report(y_test, y_pred_sgd, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.67 0.29 0.40 35
Medium Price 0.65 0.76 0.70 109
High Price 0.80 0.81 0.81 110
accuracy 0.72 254
macro avg 0.71 0.62 0.64 254
weighted avg 0.72 0.72 0.70 254
sgd_t1_l, sgd_t2_l, sgd_t1_m, sgd_t2_m, sgd_t1_h, sgd_t2_h = Compute_Error(confusion_matrix_sgd)
Type1_Error_LowPrice: 5 Type2_Error_LowPrice: 25 Type1_Error_MediumPrice: 45 Type2_Error_MediumPrice: 26 Type1_Error_HighPrice: 22 Type2_Error_HighPrice: 21
sgd_pl, sgd_pm, sgd_ph = precision_score(y_test, y_pred_sgd, average=None)
sgd_rl, sgd_rm, sgd_rh = recall_score(y_test, y_pred_sgd, average=None)
sgd_fl, sgd_fm, sgd_fh = f1_score(y_test, y_pred_sgd, average=None)
cv_sgd = cross_val_score(sgd, x_train, y_train, cv = 10, scoring='accuracy')
cv_sgd
array([0.7745098 , 0.74509804, 0.67326733, 0.76237624, 0.73267327,
0.7029703 , 0.81188119, 0.74257426, 0.73267327, 0.73267327])
cv_sgd_m = cv_sgd.mean()
print("Cross Validation Score:", cv_sgd_m)
Cross Validation Score: 0.7410696952048145
plot_learning_curves(x_train, y_train, x_test, y_test, sgd)
plt.show()
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.get_params()
{'ccp_alpha': 0.0,
'criterion': 'friedman_mse',
'init': None,
'learning_rate': 0.1,
'loss': 'log_loss',
'max_depth': 3,
'max_features': None,
'max_leaf_nodes': None,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_iter_no_change': None,
'random_state': None,
'subsample': 1.0,
'tol': 0.0001,
'validation_fraction': 0.1,
'verbose': 0,
'warm_start': False}
gbc.fit(x_train, y_train)
GradientBoostingClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier()
# Model Scores on training and test set
print("Training Set score:", gbc.score(x_train, y_train))
print("Test Set score:", gbc.score(x_test, y_test))
Training Set score: 0.974308300395257 Test Set score: 0.8661417322834646
# Prediction on Testing Data
y_pred_gbc = gbc.predict(x_test)
gbc_accuracy = metrics.accuracy_score(y_test, y_pred_gbc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_gbc))
Accuracy: 0.8661417322834646
# Prediction on Training Data
y_pred2_gbc = gbc.predict(x_train)
gbc_taccuracy = metrics.accuracy_score(y_train, y_pred2_gbc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_gbc))
Accuracy: 0.974308300395257
confusion_matrix_gbc = metrics.confusion_matrix(y_test, y_pred_gbc)
confusion_matrix_gbc
array([[ 28, 7, 0],
[ 7, 91, 11],
[ 0, 9, 101]], dtype=int64)
Confusion_Matrix_Plotter(confusion_matrix_gbc, 1)
confusion_matrix_gbc_percent = confusion_matrix_gbc.astype('float') / confusion_matrix_gbc.sum(axis=1)[:, np.newaxis]
confusion_matrix_gbc_percent
array([[0.8 , 0.2 , 0. ],
[0.06422018, 0.83486239, 0.10091743],
[0. , 0.08181818, 0.91818182]])
Confusion_Matrix_Plotter(confusion_matrix_gbc_percent, 0)
print(classification_report(y_test, y_pred_gbc, target_names=["Low Price", "Medium Price", "High Price"]))
precision recall f1-score support
Low Price 0.80 0.80 0.80 35
Medium Price 0.85 0.83 0.84 109
High Price 0.90 0.92 0.91 110
accuracy 0.87 254
macro avg 0.85 0.85 0.85 254
weighted avg 0.87 0.87 0.87 254
gbc_t1_l, gbc_t2_l, gbc_t1_m, gbc_t2_m, gbc_t1_h, gbc_t2_h = Compute_Error(confusion_matrix_gbc)
Type1_Error_LowPrice: 7 Type2_Error_LowPrice: 7 Type1_Error_MediumPrice: 16 Type2_Error_MediumPrice: 18 Type1_Error_HighPrice: 11 Type2_Error_HighPrice: 9
gbc_pl, gbc_pm, gbc_ph = precision_score(y_test, y_pred_gbc, average=None)
gbc_rl, gbc_rm, gbc_rh = recall_score(y_test, y_pred_gbc, average=None)
gbc_fl, gbc_fm, gbc_fh = f1_score(y_test, y_pred_gbc, average=None)
cv_gbc = cross_val_score(gbc, x_train, y_train, cv = 10, scoring='accuracy')
cv_gbc
array([0.8627451 , 0.83333333, 0.84158416, 0.88118812, 0.86138614,
0.9009901 , 0.87128713, 0.88118812, 0.87128713, 0.83168317])
cv_gbc_m = cv_gbc.mean()
print("Cross Validation Score:", cv_sgd_m)
Cross Validation Score: 0.7410696952048145
plot_learning_curves(x_train, y_train, x_test, y_test, gbc)
plt.show()
models_error = [('Naive Bayes', nb_t1_l, nb_t2_l, nb_t1_m, nb_t2_m, nb_t1_h, nb_t2_h),
('Support Vector Classification', svc_t1_l, svc_t2_l, svc_t1_m, svc_t2_m, svc_t1_h, svc_t2_h),
('Support Vector Classification with GridSearchCV', svc_gs_t1_l, svc_gs_t2_l, svc_gs_t1_m, svc_gs_t2_m, svc_gs_t1_h, svc_gs_t2_h),
('Logistic Regression', lr_t1_l, lr_t2_l, lr_t1_m, lr_t2_m, lr_t1_h, lr_t2_h),
('Logistic Regression with GridSearchCV', lr_gs_t1_l, lr_gs_t2_l, lr_gs_t1_m, lr_gs_t2_m, lr_gs_t1_h, lr_gs_t2_h),
('AdaBoost Classifier', abc_t1_l, abc_t2_l, abc_t1_m, abc_t2_m, abc_t1_h, abc_t2_h),
('K-Nearest Neighbors with GridSearchCV', knn_t1_l, knn_t2_l, knn_t1_m, knn_t2_m, knn_t1_h, knn_t2_h),
('Decision Trees', dt_t1_l, dt_t2_l, dt_t1_m, dt_t2_m, dt_t1_h, dt_t2_h),
('Decision Trees with GridSearchCV', dtp_t1_l, dtp_t2_l, dtp_t1_m, dtp_t2_m, dtp_t1_h, dtp_t2_h),
('Random Forest Classifier', rfc_t1_l, rfc_t2_l, rfc_t1_m, rfc_t2_m, rfc_t1_h, rfc_t2_h),
('Random Forest with RandomizedSearchCV', rfc_rscv_t1_l, rfc_rscv_t2_l, rfc_rscv_t1_m, rfc_rscv_t2_m, rfc_rscv_t1_h, rfc_rscv_t2_h),
('Neural Network (48, 24, 12)', mlp1_t1_l, mlp1_t2_l, mlp1_t1_m, mlp1_t2_m, mlp1_t1_h, mlp1_t2_h),
('Neural Network (500, 250)', mlp2_t1_l, mlp2_t2_l, mlp2_t1_m, mlp2_t2_m, mlp2_t1_h, mlp2_t2_h),
('XGBoost Classifier', xgbc_t1_l, xgbc_t2_l, xgbc_t1_m, xgbc_t2_m, xgbc_t1_h, xgbc_t2_h),
('Stochastic Gradient Descent', sgd_t1_l, sgd_t2_l, sgd_t1_m, sgd_t2_m, sgd_t1_h, sgd_t2_h),
('Gradient Boosting Classifier', gbc_t1_l, gbc_t2_l, gbc_t1_m, gbc_t2_m, gbc_t1_h, gbc_t2_h)]
error_data = pd.DataFrame(data = models_error, columns=['Model', 'Type1 Error Low Price', 'Type2 Error Low Price',
'Type1 Error Medium Price', 'Type2 Error Medium Price',
'Type1 Error High Price', 'Type2 Error High Price'])
error_data
| Model | Type1 Error Low Price | Type2 Error Low Price | Type1 Error Medium Price | Type2 Error Medium Price | Type1 Error High Price | Type2 Error High Price | |
|---|---|---|---|---|---|---|---|
| 0 | Naive Bayes | 12 | 17 | 31 | 45 | 35 | 16 |
| 1 | Support Vector Classification | 8 | 11 | 25 | 25 | 18 | 15 |
| 2 | Support Vector Classification with GridSearchCV | 8 | 10 | 22 | 25 | 18 | 13 |
| 3 | Logistic Regression | 3 | 19 | 46 | 22 | 20 | 28 |
| 4 | Logistic Regression with GridSearchCV | 6 | 13 | 38 | 23 | 18 | 26 |
| 5 | AdaBoost Classifier | 9 | 14 | 37 | 21 | 12 | 23 |
| 6 | K-Nearest Neighbors with GridSearchCV | 10 | 5 | 11 | 20 | 10 | 6 |
| 7 | Decision Trees | 7 | 5 | 20 | 20 | 14 | 16 |
| 8 | Decision Trees with GridSearchCV | 11 | 6 | 11 | 34 | 24 | 6 |
| 9 | Random Forest Classifier | 7 | 8 | 15 | 27 | 21 | 8 |
| 10 | Random Forest with RandomizedSearchCV | 8 | 8 | 14 | 29 | 22 | 7 |
| 11 | Neural Network (48, 24, 12) | 0 | 35 | 145 | 0 | 0 | 110 |
| 12 | Neural Network (500, 250) | 4 | 20 | 55 | 19 | 16 | 36 |
| 13 | XGBoost Classifier | 10 | 6 | 13 | 22 | 12 | 7 |
| 14 | Stochastic Gradient Descent | 5 | 25 | 45 | 26 | 22 | 21 |
| 15 | Gradient Boosting Classifier | 7 | 7 | 16 | 18 | 11 | 9 |
models_score = [('Naive Bayes', nb_pl, nb_pm, nb_ph, nb_rl, nb_rm, nb_rh, nb_fl, nb_fm, nb_fh),
('Support Vector Classification', svc_pl, svc_pm, svc_ph, svc_rl, svc_rm, svc_rh, svc_fl, svc_fm, svc_fh),
('Support Vector Classification with GridSearchCV', svc_gs_pl, svc_gs_pm, svc_gs_ph, svc_gs_rl, svc_gs_rm, svc_gs_rh, svc_gs_fl, svc_gs_fm, svc_gs_fh),
('Logistic Regression', lr_pl, lr_pm, lr_ph, lr_rl, lr_rm, lr_rh, lr_fl, lr_fm, lr_fh),
('Logistic Regression with GridSearchCV', lr_gs_pl, lr_gs_pm, lr_gs_ph, lr_gs_rl, lr_gs_rm, lr_gs_rh, lr_gs_fl, lr_gs_fm, lr_gs_fh),
('AdaBoost Classifier', abc_pl, abc_pm, abc_ph, abc_rl, abc_rm, abc_rh, abc_fl, abc_fm, abc_fh),
('K-Nearest Neighbors with GridSearchCV', knn_pl, knn_pm, knn_ph, knn_rl, knn_rm, knn_rh, knn_fl, knn_fm, knn_fh),
('Decision Trees', dt_pl, dt_pm, dt_ph, dt_rl, dt_rm, dt_rh, dt_fl, dt_fm, dt_fh),
('Decision Trees with GridSearchCV', dtp_pl, dtp_pm, dtp_ph, dtp_rl, dtp_rm, dtp_rh, dtp_fl, dtp_fm, dtp_fh),
('Random Forest Classifier', rfc_pl, rfc_pm, rfc_ph, rfc_rl, rfc_rm, rfc_rh, rfc_fl, rfc_fm, rfc_fh),
('Random Forest with RandomizedSearchCV', rfc_rscv_pl, rfc_rscv_pm, rfc_rscv_ph, rfc_rscv_rl, rfc_rscv_rm, rfc_rscv_rh, rfc_rscv_fl, rfc_rscv_fm, rfc_rscv_fh),
('Neural Network (48, 24, 12)', mlp1_pl, mlp1_pm, mlp1_ph, mlp1_rl, mlp1_rm, mlp1_rh, mlp1_fl, mlp1_fm, mlp1_fh),
('Neural Network (500, 250)', mlp2_pl, mlp2_pm, mlp2_ph, mlp2_rl, mlp2_rm, mlp2_rh, mlp2_fl, mlp2_fm, mlp2_fh),
('XGBoost Classifier', xgbc_pl, xgbc_pm, xgbc_ph, xgbc_rl, xgbc_rm, xgbc_rh, xgbc_fl, xgbc_fm, xgbc_fh),
('Stochastic Gradient Descent', sgd_pl, sgd_pm, sgd_ph, sgd_rl, sgd_rm, sgd_rh, sgd_fl, sgd_fm, sgd_fh),
('Gradient Boosting Classifier', gbc_pl, gbc_pm, gbc_ph, gbc_rl, gbc_rm, gbc_rh, gbc_fl, gbc_fm, gbc_fh)]
score_performance = pd.DataFrame(data=models_score,
columns=['Model', 'Precision Score Low Price', 'Precision Score Medium Price', 'Precision Score High Price',
'Recall Score Low Price', 'Recall Score Medium Price', 'Recall Score High Price',
'F1 Score Low Price', 'F1 Score Medium Price', 'F1 Score High Price'])
score_performance
| Model | Precision Score Low Price | Precision Score Medium Price | Precision Score High Price | Recall Score Low Price | Recall Score Medium Price | Recall Score High Price | F1 Score Low Price | F1 Score Medium Price | F1 Score High Price | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Naive Bayes | 0.600000 | 0.673684 | 0.728682 | 0.514286 | 0.587156 | 0.854545 | 0.553846 | 0.627451 | 0.786611 |
| 1 | Support Vector Classification | 0.750000 | 0.770642 | 0.840708 | 0.685714 | 0.770642 | 0.863636 | 0.716418 | 0.770642 | 0.852018 |
| 2 | Support Vector Classification with GridSearchCV | 0.757576 | 0.792453 | 0.843478 | 0.714286 | 0.770642 | 0.881818 | 0.735294 | 0.781395 | 0.862222 |
| 3 | Logistic Regression | 0.842105 | 0.654135 | 0.803922 | 0.457143 | 0.798165 | 0.745455 | 0.592593 | 0.719008 | 0.773585 |
| 4 | Logistic Regression with GridSearchCV | 0.785714 | 0.693548 | 0.823529 | 0.628571 | 0.788991 | 0.763636 | 0.698413 | 0.738197 | 0.792453 |
| 5 | AdaBoost Classifier | 0.700000 | 0.704000 | 0.878788 | 0.600000 | 0.807339 | 0.790909 | 0.646154 | 0.752137 | 0.832536 |
| 6 | K-Nearest Neighbors with GridSearchCV | 0.750000 | 0.890000 | 0.912281 | 0.857143 | 0.816514 | 0.945455 | 0.800000 | 0.851675 | 0.928571 |
| 7 | Decision Trees | 0.810811 | 0.816514 | 0.870370 | 0.857143 | 0.816514 | 0.854545 | 0.833333 | 0.816514 | 0.862385 |
| 8 | Decision Trees with GridSearchCV | 0.725000 | 0.872093 | 0.812500 | 0.828571 | 0.688073 | 0.945455 | 0.773333 | 0.769231 | 0.873950 |
| 9 | Random Forest Classifier | 0.794118 | 0.845361 | 0.829268 | 0.771429 | 0.752294 | 0.927273 | 0.782609 | 0.796117 | 0.875536 |
| 10 | Random Forest with RandomizedSearchCV | 0.771429 | 0.851064 | 0.824000 | 0.771429 | 0.733945 | 0.936364 | 0.771429 | 0.788177 | 0.876596 |
| 11 | Neural Network (48, 24, 12) | 0.000000 | 0.429134 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.600551 | 0.000000 |
| 12 | Neural Network (500, 250) | 0.789474 | 0.620690 | 0.822222 | 0.428571 | 0.825688 | 0.672727 | 0.555556 | 0.708661 | 0.740000 |
| 13 | XGBoost Classifier | 0.743590 | 0.870000 | 0.895652 | 0.828571 | 0.798165 | 0.936364 | 0.783784 | 0.832536 | 0.915556 |
| 14 | Stochastic Gradient Descent | 0.666667 | 0.648438 | 0.801802 | 0.285714 | 0.761468 | 0.809091 | 0.400000 | 0.700422 | 0.805430 |
| 15 | Gradient Boosting Classifier | 0.800000 | 0.850467 | 0.901786 | 0.800000 | 0.834862 | 0.918182 | 0.800000 | 0.842593 | 0.909910 |
models = [('Naive Bayes', nb_accuracy, nb_taccuracy, cv_nb_m),
('Support Vector Classification', svc_accuracy, svc_taccuracy, cv_svc_m),
('Support Vector Classification with GridSearchCV', svc_gscv_accuracy, svc_gscv_taccuracy, cv_svc_gscv_b),
('Logistic Regression', lr_accuracy, lr_taccuracy, cv_lr_m),
('Logistic Regression with GridSearchCV', lr_gs_accuracy, lr_gs_taccuracy, cv_lr_gs_b),
('AdaBoost Classifier', abc_accuracy, abc_taccuracy, cv_abc_m),
('K-Nearest Neighbors with GridSearchCV', knn_accuracy, knn_taccuracy, cv_knn_b),
('Decision Trees', dtc_accuracy, dtc_taccuracy, cv_dt_m),
('Decision Trees with GridSearchCV', dtc_pt_accuracy, dtc_pt_taccuracy, cv_dt_gscv_b),
('Random Forest Classifier', rfc_accuracy, rfc_taccuracy, cv_rfc_m),
('Random Forest with RandomizedSearchCV', rfc_rscv_accuracy, rfc_rscv_taccuracy, cv_rfc_rscv_b),
('XGBoost Classifier', xgbc_accuracy, xgbc_taccuracy, cv_xgbc_m),
('Stochastic Gradient Descent', sgd_accuracy, sgd_taccuracy, cv_sgd_m),
('Gradient Boosting Classifier', gbc_accuracy, gbc_taccuracy, cv_gbc_m),
('Neural Network (48, 24, 12)', mlp_accuracy, mlp_taccuracy, "None"),
('Neural Network (500, 250)', mlp2_accuracy, mlp2_taccuracy, "None")]
performance = pd.DataFrame(data=models, columns=['Model', 'Accuracy(Test Set)', 'Accuracy(Training Set)', 'Cross-Validation'])
performance
| Model | Accuracy(Test Set) | Accuracy(Training Set) | Cross-Validation | |
|---|---|---|---|---|
| 0 | Naive Bayes | 0.692913 | 0.708498 | 0.702631 |
| 1 | Support Vector Classification | 0.799213 | 0.832016 | 0.821171 |
| 2 | Support Vector Classification with GridSearchCV | 0.811024 | 0.833992 | 0.827082 |
| 3 | Logistic Regression | 0.728346 | 0.752964 | 0.74207 |
| 4 | Logistic Regression with GridSearchCV | 0.755906 | 0.791502 | 0.786566 |
| 5 | AdaBoost Classifier | 0.771654 | 0.784585 | 0.774714 |
| 6 | K-Nearest Neighbors with GridSearchCV | 0.877953 | 1.000000 | 0.887352 |
| 7 | Decision Trees | 0.838583 | 1.000000 | 0.836993 |
| 8 | Decision Trees with GridSearchCV | 0.818898 | 0.850791 | 0.830052 |
| 9 | Random Forest Classifier | 0.830709 | 0.859684 | 0.830062 |
| 10 | Random Forest with RandomizedSearchCV | 0.826772 | 0.861660 | 0.835022 |
| 11 | XGBoost Classifier | 0.862205 | 1.000000 | 0.872578 |
| 12 | Stochastic Gradient Descent | 0.716535 | 0.726285 | 0.74107 |
| 13 | Gradient Boosting Classifier | 0.866142 | 0.974308 | 0.863667 |
| 14 | Neural Network (48, 24, 12) | 0.429134 | 0.442688 | None |
| 15 | Neural Network (500, 250) | 0.704724 | 0.733202 | None |
performance.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 16 entries, 0 to 15 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Model 16 non-null object 1 Accuracy(Test Set) 16 non-null float64 2 Accuracy(Training Set) 16 non-null float64 3 Cross-Validation 16 non-null object dtypes: float64(2), object(2) memory usage: 640.0+ bytes
performance['Cross-Validation'][:14]
0 0.702631 1 0.821171 2 0.827082 3 0.74207 4 0.786566 5 0.774714 6 0.887352 7 0.836993 8 0.830052 9 0.830062 10 0.835022 11 0.872578 12 0.74107 13 0.863667 Name: Cross-Validation, dtype: object
performance['Cross-Validation'] = performance['Cross-Validation'][:14].astype('float64')
f, axe = plt.subplots(1, 1, figsize=(10, 6))
performance.sort_values(by=['Cross-Validation'][:14], ascending=False, inplace=True)
sns.barplot(x='Cross-Validation', y='Model', data=performance[:14], ax=axe)
axe.set_xlabel('Cross-Validaton Score', size=14)
axe.set_ylabel('Models', size=14)
axe.set_xlim(0, 1.0)
axe.set_xticks(np.arange(0, 1.1, 0.1))
# plt.title("Cross-Validaton Score Plot")
plt.tight_layout()
f, axes = plt.subplots(2, 1, figsize=(12, 10))
performance.sort_values(by=['Accuracy(Training Set)'], ascending=False, inplace=True)
sns.barplot(x='Accuracy(Training Set)', y='Model', data=performance, palette='Blues_d', ax=axes[0])
axes[0].set_xlabel('Accuracy (Training Set)', size=14)
axes[0].set_ylabel('Model', size=14)
axes[0].set_xlim(0, 1.0)
axes[0].set_xticks(np.arange(0, 1.1, 0.1))
performance.sort_values(by=['Accuracy(Test Set)'], ascending=False, inplace=True)
sns.barplot(x='Accuracy(Test Set)', y='Model', data=performance, palette='Reds_d', ax=axes[1])
axes[1].set_xlabel('Accuracy (Test Set)', size=14)
axes[1].set_ylabel('Model', size=14)
axes[1].set_xlim(0, 1.0)
axes[1].set_xticks(np.arange(0, 1.1, 0.1))
# plt.title("Accuracy Plot")
plt.tight_layout()
# Sorted based on Accuracy(Test Set)
performance.sort_values(by=['Accuracy(Test Set)'], ascending=False, inplace=True)
performance
| Model | Accuracy(Test Set) | Accuracy(Training Set) | Cross-Validation | |
|---|---|---|---|---|
| 6 | K-Nearest Neighbors with GridSearchCV | 0.877953 | 1.000000 | 0.887352 |
| 13 | Gradient Boosting Classifier | 0.866142 | 0.974308 | 0.863667 |
| 11 | XGBoost Classifier | 0.862205 | 1.000000 | 0.872578 |
| 7 | Decision Trees | 0.838583 | 1.000000 | 0.836993 |
| 9 | Random Forest Classifier | 0.830709 | 0.859684 | 0.830062 |
| 10 | Random Forest with RandomizedSearchCV | 0.826772 | 0.861660 | 0.835022 |
| 8 | Decision Trees with GridSearchCV | 0.818898 | 0.850791 | 0.830052 |
| 2 | Support Vector Classification with GridSearchCV | 0.811024 | 0.833992 | 0.827082 |
| 1 | Support Vector Classification | 0.799213 | 0.832016 | 0.821171 |
| 5 | AdaBoost Classifier | 0.771654 | 0.784585 | 0.774714 |
| 4 | Logistic Regression with GridSearchCV | 0.755906 | 0.791502 | 0.786566 |
| 3 | Logistic Regression | 0.728346 | 0.752964 | 0.742070 |
| 12 | Stochastic Gradient Descent | 0.716535 | 0.726285 | 0.741070 |
| 15 | Neural Network (500, 250) | 0.704724 | 0.733202 | NaN |
| 0 | Naive Bayes | 0.692913 | 0.708498 | 0.702631 |
| 14 | Neural Network (48, 24, 12) | 0.429134 | 0.442688 | NaN |
# Sorted based on Accuracy(Training Set)
performance.sort_values(by=['Accuracy(Training Set)'], ascending=False, inplace=True)
performance
| Model | Accuracy(Test Set) | Accuracy(Training Set) | Cross-Validation | |
|---|---|---|---|---|
| 6 | K-Nearest Neighbors with GridSearchCV | 0.877953 | 1.000000 | 0.887352 |
| 11 | XGBoost Classifier | 0.862205 | 1.000000 | 0.872578 |
| 7 | Decision Trees | 0.838583 | 1.000000 | 0.836993 |
| 13 | Gradient Boosting Classifier | 0.866142 | 0.974308 | 0.863667 |
| 10 | Random Forest with RandomizedSearchCV | 0.826772 | 0.861660 | 0.835022 |
| 9 | Random Forest Classifier | 0.830709 | 0.859684 | 0.830062 |
| 8 | Decision Trees with GridSearchCV | 0.818898 | 0.850791 | 0.830052 |
| 2 | Support Vector Classification with GridSearchCV | 0.811024 | 0.833992 | 0.827082 |
| 1 | Support Vector Classification | 0.799213 | 0.832016 | 0.821171 |
| 4 | Logistic Regression with GridSearchCV | 0.755906 | 0.791502 | 0.786566 |
| 5 | AdaBoost Classifier | 0.771654 | 0.784585 | 0.774714 |
| 3 | Logistic Regression | 0.728346 | 0.752964 | 0.742070 |
| 15 | Neural Network (500, 250) | 0.704724 | 0.733202 | NaN |
| 12 | Stochastic Gradient Descent | 0.716535 | 0.726285 | 0.741070 |
| 0 | Naive Bayes | 0.692913 | 0.708498 | 0.702631 |
| 14 | Neural Network (48, 24, 12) | 0.429134 | 0.442688 | NaN |